Search in sources :

Example 1 with ParallelIndexIngestionSpec

use of org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec in project druid by druid-io.

the class CompactionTask method createIngestionSchema.

/**
 * Generate {@link ParallelIndexIngestionSpec} from input segments.
 *
 * @return an empty list if input segments don't exist. Otherwise, a generated ingestionSpec.
 */
@VisibleForTesting
static List<ParallelIndexIngestionSpec> createIngestionSchema(final TaskToolbox toolbox, final LockGranularity lockGranularityInUse, final SegmentProvider segmentProvider, final PartitionConfigurationManager partitionConfigurationManager, @Nullable final DimensionsSpec dimensionsSpec, @Nullable final ClientCompactionTaskTransformSpec transformSpec, @Nullable final AggregatorFactory[] metricsSpec, @Nullable final ClientCompactionTaskGranularitySpec granularitySpec, final CoordinatorClient coordinatorClient, final SegmentCacheManagerFactory segmentCacheManagerFactory, final RetryPolicyFactory retryPolicyFactory, final boolean dropExisting) throws IOException, SegmentLoadingException {
    NonnullPair<Map<DataSegment, File>, List<TimelineObjectHolder<String, DataSegment>>> pair = prepareSegments(toolbox, segmentProvider, lockGranularityInUse);
    final Map<DataSegment, File> segmentFileMap = pair.lhs;
    final List<TimelineObjectHolder<String, DataSegment>> timelineSegments = pair.rhs;
    if (timelineSegments.size() == 0) {
        return Collections.emptyList();
    }
    // find metadata for interval
    // queryableIndexAndSegments is sorted by the interval of the dataSegment
    final List<NonnullPair<QueryableIndex, DataSegment>> queryableIndexAndSegments = loadSegments(timelineSegments, segmentFileMap, toolbox.getIndexIO());
    final CompactionTuningConfig compactionTuningConfig = partitionConfigurationManager.computeTuningConfig();
    if (granularitySpec == null || granularitySpec.getSegmentGranularity() == null) {
        // original granularity
        final Map<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> intervalToSegments = new TreeMap<>(Comparators.intervalsByStartThenEnd());
        queryableIndexAndSegments.forEach(p -> intervalToSegments.computeIfAbsent(p.rhs.getInterval(), k -> new ArrayList<>()).add(p));
        // unify overlapping intervals to ensure overlapping segments compacting in the same indexSpec
        List<NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>>> intervalToSegmentsUnified = new ArrayList<>();
        Interval union = null;
        List<NonnullPair<QueryableIndex, DataSegment>> segments = new ArrayList<>();
        for (Entry<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegments.entrySet()) {
            Interval cur = entry.getKey();
            if (union == null) {
                union = cur;
                segments.addAll(entry.getValue());
            } else if (union.overlaps(cur)) {
                union = Intervals.utc(union.getStartMillis(), Math.max(union.getEndMillis(), cur.getEndMillis()));
                segments.addAll(entry.getValue());
            } else {
                intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
                union = cur;
                segments = new ArrayList<>(entry.getValue());
            }
        }
        intervalToSegmentsUnified.add(new NonnullPair<>(union, segments));
        final List<ParallelIndexIngestionSpec> specs = new ArrayList<>(intervalToSegmentsUnified.size());
        for (NonnullPair<Interval, List<NonnullPair<QueryableIndex, DataSegment>>> entry : intervalToSegmentsUnified) {
            final Interval interval = entry.lhs;
            final List<NonnullPair<QueryableIndex, DataSegment>> segmentsToCompact = entry.rhs;
            // If granularitySpec is not null, then set segmentGranularity. Otherwise,
            // creates new granularitySpec and set segmentGranularity
            Granularity segmentGranularityToUse = GranularityType.fromPeriod(interval.toPeriod()).getDefaultGranularity();
            final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, segmentsToCompact, dimensionsSpec, transformSpec, metricsSpec, granularitySpec == null ? new ClientCompactionTaskGranularitySpec(segmentGranularityToUse, null, null) : granularitySpec.withSegmentGranularity(segmentGranularityToUse));
            specs.add(new ParallelIndexIngestionSpec(dataSchema, createIoConfig(toolbox, dataSchema, interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting), compactionTuningConfig));
        }
        return specs;
    } else {
        // given segment granularity
        final DataSchema dataSchema = createDataSchema(segmentProvider.dataSource, queryableIndexAndSegments, dimensionsSpec, transformSpec, metricsSpec, granularitySpec);
        return Collections.singletonList(new ParallelIndexIngestionSpec(dataSchema, createIoConfig(toolbox, dataSchema, segmentProvider.interval, coordinatorClient, segmentCacheManagerFactory, retryPolicyFactory, dropExisting), compactionTuningConfig));
    }
}
Also used : ArrayList(java.util.ArrayList) LockGranularity(org.apache.druid.indexing.common.LockGranularity) Granularity(org.apache.druid.java.util.common.granularity.Granularity) DataSegment(org.apache.druid.timeline.DataSegment) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) NonnullPair(org.apache.druid.java.util.common.NonnullPair) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) TreeMap(java.util.TreeMap) ClientCompactionTaskGranularitySpec(org.apache.druid.client.indexing.ClientCompactionTaskGranularitySpec) DataSchema(org.apache.druid.segment.indexing.DataSchema) TimelineObjectHolder(org.apache.druid.timeline.TimelineObjectHolder) QueryableIndex(org.apache.druid.segment.QueryableIndex) Map(java.util.Map) BiMap(com.google.common.collect.BiMap) HashMap(java.util.HashMap) HashBiMap(com.google.common.collect.HashBiMap) TreeMap(java.util.TreeMap) File(java.io.File) Interval(org.joda.time.Interval) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with ParallelIndexIngestionSpec

use of org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec in project druid by druid-io.

the class CompactionTaskParallelRunTest method runIndexTask.

private void runIndexTask(@Nullable PartitionsSpec partitionsSpec, boolean appendToExisting) {
    ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(null, new LocalInputSource(inputDir, "druid*"), new CsvInputFormat(Arrays.asList("ts", "dim", "val"), "|", null, false, 0), appendToExisting, null);
    ParallelIndexTuningConfig tuningConfig = newTuningConfig(partitionsSpec, 2, !appendToExisting);
    ParallelIndexSupervisorTask indexTask = new ParallelIndexSupervisorTask(null, null, null, new ParallelIndexIngestionSpec(new DataSchema(DATA_SOURCE, new TimestampSpec("ts", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("ts", "dim"))), new AggregatorFactory[] { new LongSumAggregatorFactory("val", "val") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, ImmutableList.of(INTERVAL_TO_INDEX)), null), ioConfig, tuningConfig), null);
    runTask(indexTask);
}
Also used : DataSchema(org.apache.druid.segment.indexing.DataSchema) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ParallelIndexIOConfig(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIOConfig) ParallelIndexSupervisorTask(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexSupervisorTask) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) ParallelIndexTuningConfig(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTuningConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource)

Example 3 with ParallelIndexIngestionSpec

use of org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec in project druid by druid-io.

the class CompactionTaskTest method testCreateIngestionSchema.

@Test
public void testCreateIngestionSchema() throws IOException, SegmentLoadingException {
    final List<ParallelIndexIngestionSpec> ingestionSpecs = CompactionTask.createIngestionSchema(toolbox, LockGranularity.TIME_CHUNK, new SegmentProvider(DATA_SOURCE, new CompactionIntervalSpec(COMPACTION_INTERVAL, null)), new PartitionConfigurationManager(TUNING_CONFIG), null, null, null, null, COORDINATOR_CLIENT, segmentCacheManagerFactory, RETRY_POLICY_FACTORY, IOConfig.DEFAULT_DROP_EXISTING);
    final List<DimensionsSpec> expectedDimensionsSpec = getExpectedDimensionsSpecForAutoGeneration();
    ingestionSpecs.sort((s1, s2) -> Comparators.intervalsByStartThenEnd().compare(s1.getDataSchema().getGranularitySpec().inputIntervals().get(0), s2.getDataSchema().getGranularitySpec().inputIntervals().get(0)));
    Assert.assertEquals(6, ingestionSpecs.size());
    assertIngestionSchema(ingestionSpecs, expectedDimensionsSpec, AGGREGATORS.stream().map(AggregatorFactory::getCombiningFactory).collect(Collectors.toList()), SEGMENT_INTERVALS, Granularities.MONTH, Granularities.NONE, IOConfig.DEFAULT_DROP_EXISTING);
}
Also used : PartitionConfigurationManager(org.apache.druid.indexing.common.task.CompactionTask.PartitionConfigurationManager) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SegmentProvider(org.apache.druid.indexing.common.task.CompactionTask.SegmentProvider) DoubleLastAggregatorFactory(org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory) FloatMinAggregatorFactory(org.apache.druid.query.aggregation.FloatMinAggregatorFactory) FloatFirstAggregatorFactory(org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) DoubleMaxAggregatorFactory(org.apache.druid.query.aggregation.DoubleMaxAggregatorFactory) LongMaxAggregatorFactory(org.apache.druid.query.aggregation.LongMaxAggregatorFactory) Test(org.junit.Test)

Example 4 with ParallelIndexIngestionSpec

use of org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec in project druid by druid-io.

the class CompactionTaskTest method testCreateIngestionSchemaWithCustomSegments.

@Test
public void testCreateIngestionSchemaWithCustomSegments() throws IOException, SegmentLoadingException {
    final List<ParallelIndexIngestionSpec> ingestionSpecs = CompactionTask.createIngestionSchema(toolbox, LockGranularity.TIME_CHUNK, new SegmentProvider(DATA_SOURCE, SpecificSegmentsSpec.fromSegments(SEGMENTS)), new PartitionConfigurationManager(TUNING_CONFIG), null, null, null, null, COORDINATOR_CLIENT, segmentCacheManagerFactory, RETRY_POLICY_FACTORY, IOConfig.DEFAULT_DROP_EXISTING);
    final List<DimensionsSpec> expectedDimensionsSpec = getExpectedDimensionsSpecForAutoGeneration();
    ingestionSpecs.sort((s1, s2) -> Comparators.intervalsByStartThenEnd().compare(s1.getDataSchema().getGranularitySpec().inputIntervals().get(0), s2.getDataSchema().getGranularitySpec().inputIntervals().get(0)));
    Assert.assertEquals(6, ingestionSpecs.size());
    assertIngestionSchema(ingestionSpecs, expectedDimensionsSpec, AGGREGATORS.stream().map(AggregatorFactory::getCombiningFactory).collect(Collectors.toList()), SEGMENT_INTERVALS, Granularities.MONTH, Granularities.NONE, IOConfig.DEFAULT_DROP_EXISTING);
}
Also used : PartitionConfigurationManager(org.apache.druid.indexing.common.task.CompactionTask.PartitionConfigurationManager) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) SegmentProvider(org.apache.druid.indexing.common.task.CompactionTask.SegmentProvider) DoubleLastAggregatorFactory(org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory) FloatMinAggregatorFactory(org.apache.druid.query.aggregation.FloatMinAggregatorFactory) FloatFirstAggregatorFactory(org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) DoubleMaxAggregatorFactory(org.apache.druid.query.aggregation.DoubleMaxAggregatorFactory) LongMaxAggregatorFactory(org.apache.druid.query.aggregation.LongMaxAggregatorFactory) Test(org.junit.Test)

Example 5 with ParallelIndexIngestionSpec

use of org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec in project druid by druid-io.

the class CompactionTaskTest method testCreateIngestionSchemaWithTargetPartitionSize.

@Test
public void testCreateIngestionSchemaWithTargetPartitionSize() throws IOException, SegmentLoadingException {
    final CompactionTask.CompactionTuningConfig tuningConfig = new CompactionTask.CompactionTuningConfig(100000, null, null, 500000, 1000000L, null, null, null, null, null, new IndexSpec(new RoaringBitmapSerdeFactory(true), CompressionStrategy.LZ4, CompressionStrategy.LZF, LongEncodingStrategy.LONGS), null, null, true, false, null, null, null, 10, null, null, null, null, null, null, null, null, null, null, null);
    final List<ParallelIndexIngestionSpec> ingestionSpecs = CompactionTask.createIngestionSchema(toolbox, LockGranularity.TIME_CHUNK, new SegmentProvider(DATA_SOURCE, new CompactionIntervalSpec(COMPACTION_INTERVAL, null)), new PartitionConfigurationManager(tuningConfig), null, null, null, null, COORDINATOR_CLIENT, segmentCacheManagerFactory, RETRY_POLICY_FACTORY, IOConfig.DEFAULT_DROP_EXISTING);
    final List<DimensionsSpec> expectedDimensionsSpec = getExpectedDimensionsSpecForAutoGeneration();
    ingestionSpecs.sort((s1, s2) -> Comparators.intervalsByStartThenEnd().compare(s1.getDataSchema().getGranularitySpec().inputIntervals().get(0), s2.getDataSchema().getGranularitySpec().inputIntervals().get(0)));
    Assert.assertEquals(6, ingestionSpecs.size());
    assertIngestionSchema(ingestionSpecs, expectedDimensionsSpec, AGGREGATORS.stream().map(AggregatorFactory::getCombiningFactory).collect(Collectors.toList()), SEGMENT_INTERVALS, tuningConfig, Granularities.MONTH, Granularities.NONE, IOConfig.DEFAULT_DROP_EXISTING);
}
Also used : IndexSpec(org.apache.druid.segment.IndexSpec) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) SegmentProvider(org.apache.druid.indexing.common.task.CompactionTask.SegmentProvider) DoubleLastAggregatorFactory(org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory) FloatMinAggregatorFactory(org.apache.druid.query.aggregation.FloatMinAggregatorFactory) FloatFirstAggregatorFactory(org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) DoubleMaxAggregatorFactory(org.apache.druid.query.aggregation.DoubleMaxAggregatorFactory) LongMaxAggregatorFactory(org.apache.druid.query.aggregation.LongMaxAggregatorFactory) RoaringBitmapSerdeFactory(org.apache.druid.segment.data.RoaringBitmapSerdeFactory) PartitionConfigurationManager(org.apache.druid.indexing.common.task.CompactionTask.PartitionConfigurationManager) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test)

Aggregations

ParallelIndexIngestionSpec (org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec)18 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)15 PartitionConfigurationManager (org.apache.druid.indexing.common.task.CompactionTask.PartitionConfigurationManager)14 SegmentProvider (org.apache.druid.indexing.common.task.CompactionTask.SegmentProvider)14 Test (org.junit.Test)14 AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory)13 LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory)13 CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory)12 DoubleMaxAggregatorFactory (org.apache.druid.query.aggregation.DoubleMaxAggregatorFactory)12 FloatMinAggregatorFactory (org.apache.druid.query.aggregation.FloatMinAggregatorFactory)12 LongMaxAggregatorFactory (org.apache.druid.query.aggregation.LongMaxAggregatorFactory)12 FloatFirstAggregatorFactory (org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory)12 DoubleLastAggregatorFactory (org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory)12 ClientCompactionTaskGranularitySpec (org.apache.druid.client.indexing.ClientCompactionTaskGranularitySpec)8 DoubleDimensionSchema (org.apache.druid.data.input.impl.DoubleDimensionSchema)4 IndexSpec (org.apache.druid.segment.IndexSpec)4 DataSchema (org.apache.druid.segment.indexing.DataSchema)4 ArrayList (java.util.ArrayList)3 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)3 ParallelIndexIOConfig (org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIOConfig)3