Example 36 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From the class CompactionTaskRunTest, method testRunWithHashPartitioning.

@Test
public void testRunWithHashPartitioning() throws Exception {
    // Hash partitioning is not supported with segment lock yet
    if (lockGranularity == LockGranularity.SEGMENT) {
        return;
    }
    runIndexTask();
    final Builder builder = new Builder(DATA_SOURCE, segmentCacheManagerFactory, RETRY_POLICY_FACTORY);
    // Tuning config that enables hash partitioning: HashedPartitionsSpec(maxRowsPerSegment = null,
    // numShards = 3, partitionDimensions = null); every other setting is left at its default.
    final CompactionTask compactionTask = builder
        .interval(Intervals.of("2014-01-01/2014-01-02"))
        .tuningConfig(new ParallelIndexTuningConfig(
            null, null, null, null, null, null, null, null, null,
            new HashedPartitionsSpec(null, 3, null),
            null, null, null, true, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null
        ))
        .build();
    final Pair<TaskStatus, List<DataSegment>> resultPair = runTask(compactionTask);
    Assert.assertTrue(resultPair.lhs.isSuccess());
    final List<DataSegment> segments = resultPair.rhs;
    Assert.assertEquals(6, segments.size());
    for (int i = 0; i < 3; i++) {
        final Interval interval = Intervals.of("2014-01-01T0%d:00:00/2014-01-01T0%d:00:00", i, i + 1);
        for (int j = 0; j < 2; j++) {
            final int segmentIdx = i * 2 + j;
            Assert.assertEquals(interval, segments.get(segmentIdx).getInterval());
            Map<String, String> expectedLongSumMetric = new HashMap<>();
            expectedLongSumMetric.put("type", "longSum");
            expectedLongSumMetric.put("name", "val");
            expectedLongSumMetric.put("fieldName", "val");
            expectedLongSumMetric.put("expression", null);
            CompactionState expectedState = new CompactionState(new HashedPartitionsSpec(null, 3, null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("ts", "dim"))), ImmutableList.of(expectedLongSumMetric), null, compactionTask.getTuningConfig().getIndexSpec().asMap(getObjectMapper()), getObjectMapper().readValue(getObjectMapper().writeValueAsString(new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, true, ImmutableList.of(Intervals.of("2014-01-01T0%d:00:00/2014-01-01T0%d:00:00", i, i + 1)))), Map.class));
            Assert.assertEquals(expectedState, segments.get(segmentIdx).getLastCompactionState());
            Assert.assertSame(HashBasedNumberedShardSpec.class, segments.get(segmentIdx).getShardSpec().getClass());
        }
    }
    List<String> rowsFromSegment = getCSVFormatRowsFromSegments(segments);
    rowsFromSegment.sort(Ordering.natural());
    Assert.assertEquals(TEST_ROWS, rowsFromSegment);
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) HashMap(java.util.HashMap) Builder(org.apache.druid.indexing.common.task.CompactionTask.Builder) TaskStatus(org.apache.druid.indexer.TaskStatus) DataSegment(org.apache.druid.timeline.DataSegment) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) CompactionState(org.apache.druid.timeline.CompactionState) ParallelIndexTuningConfig(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexTuningConfig) Map(java.util.Map) HashMap(java.util.HashMap) Interval(org.joda.time.Interval) Test(org.junit.Test)
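
For reference, the three-argument HashedPartitionsSpec constructor used in this test takes (maxRowsPerSegment, numShards, partitionDimensions), so new HashedPartitionsSpec(null, 3, null) configures three hash buckets and, with null partitionDimensions, hashes on all dimensions. A minimal standalone sketch of that construction, assuming the usual getters on the spec (not taken from the test above):

import org.apache.druid.indexer.partitions.HashedPartitionsSpec;

public class HashedPartitionsSpecSketch {

    public static void main(String[] args) {
        // maxRowsPerSegment = null, numShards = 3, partitionDimensions = null (hash on all dimensions)
        HashedPartitionsSpec spec = new HashedPartitionsSpec(null, 3, null);
        System.out.println(spec.getNumShards());           // 3
        System.out.println(spec.getPartitionDimensions()); // expected to be empty: no explicit partition dimensions
    }
}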

Example 37 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From the class IndexTaskSerdeTest, method testSerdeTuningConfigWithHashedPartitionsSpec.

@Test
public void testSerdeTuningConfigWithHashedPartitionsSpec() throws IOException {
    final IndexTuningConfig tuningConfig = new IndexTuningConfig(
        null, null, null, 100, 2000L, null, null, null, null, null,
        // hash partitioning: numShards = 10, partitioning on dim1 and dim2
        new HashedPartitionsSpec(null, 10, ImmutableList.of("dim1", "dim2")),
        new IndexSpec(new RoaringBitmapSerdeFactory(false), CompressionStrategy.LZ4, CompressionStrategy.LZF, LongEncodingStrategy.LONGS),
        null, null, true, null, null, 100L, OffHeapMemorySegmentWriteOutMediumFactory.instance(), true, 10, 100, null, -1L
    );
    assertSerdeTuningConfig(tuningConfig);
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) IndexSpec(org.apache.druid.segment.IndexSpec) RoaringBitmapSerdeFactory(org.apache.druid.segment.data.RoaringBitmapSerdeFactory) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) Test(org.junit.Test)
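
The JSON that this tuning config round-trips through uses the "hashed" partitionsSpec type. A minimal serde sketch, assuming the Jackson subtype annotations on PartitionsSpec are sufficient for a plain ObjectMapper (the test itself goes through Druid's configured mapper):

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;

public class HashedPartitionsSpecSerdeSketch {

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        String json = "{\"type\": \"hashed\", \"numShards\": 10, \"partitionDimensions\": [\"dim1\", \"dim2\"]}";
        // Deserialize against the base type; the "type" field selects HashedPartitionsSpec.
        PartitionsSpec spec = mapper.readValue(json, PartitionsSpec.class);
        System.out.println(((HashedPartitionsSpec) spec).getNumShards()); // 10
        // Serializing back should yield an equivalent "hashed" spec.
        System.out.println(mapper.writeValueAsString(spec));
    }
}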

Example 38 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From the class CompactionTaskTest, method testCreateIngestionSchemaWithNumShards.

@Test
public void testCreateIngestionSchemaWithNumShards() throws IOException, SegmentLoadingException {
    final CompactionTask.CompactionTuningConfig tuningConfig = new CompactionTask.CompactionTuningConfig(
        null, null, null, 500000, 1000000L, null, null, null, null,
        // hash partitioning: numShards = 3
        new HashedPartitionsSpec(null, 3, null),
        new IndexSpec(new RoaringBitmapSerdeFactory(true), CompressionStrategy.LZ4, CompressionStrategy.LZF, LongEncodingStrategy.LONGS),
        null, null, true, false, 5000L, null, null, 10, null, null, null, null, null, null, null, null, null, null, null
    );
    final List<ParallelIndexIngestionSpec> ingestionSpecs = CompactionTask.createIngestionSchema(toolbox, LockGranularity.TIME_CHUNK, new SegmentProvider(DATA_SOURCE, new CompactionIntervalSpec(COMPACTION_INTERVAL, null)), new PartitionConfigurationManager(tuningConfig), null, null, null, null, COORDINATOR_CLIENT, segmentCacheManagerFactory, RETRY_POLICY_FACTORY, IOConfig.DEFAULT_DROP_EXISTING);
    final List<DimensionsSpec> expectedDimensionsSpec = getExpectedDimensionsSpecForAutoGeneration();
    ingestionSpecs.sort((s1, s2) -> Comparators.intervalsByStartThenEnd().compare(s1.getDataSchema().getGranularitySpec().inputIntervals().get(0), s2.getDataSchema().getGranularitySpec().inputIntervals().get(0)));
    Assert.assertEquals(6, ingestionSpecs.size());
    assertIngestionSchema(ingestionSpecs, expectedDimensionsSpec, AGGREGATORS.stream().map(AggregatorFactory::getCombiningFactory).collect(Collectors.toList()), SEGMENT_INTERVALS, tuningConfig, Granularities.MONTH, Granularities.NONE, IOConfig.DEFAULT_DROP_EXISTING);
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) IndexSpec(org.apache.druid.segment.IndexSpec) ParallelIndexIngestionSpec(org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec) SegmentProvider(org.apache.druid.indexing.common.task.CompactionTask.SegmentProvider) DoubleLastAggregatorFactory(org.apache.druid.query.aggregation.last.DoubleLastAggregatorFactory) FloatMinAggregatorFactory(org.apache.druid.query.aggregation.FloatMinAggregatorFactory) FloatFirstAggregatorFactory(org.apache.druid.query.aggregation.first.FloatFirstAggregatorFactory) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) CountAggregatorFactory(org.apache.druid.query.aggregation.CountAggregatorFactory) DoubleMaxAggregatorFactory(org.apache.druid.query.aggregation.DoubleMaxAggregatorFactory) LongMaxAggregatorFactory(org.apache.druid.query.aggregation.LongMaxAggregatorFactory) RoaringBitmapSerdeFactory(org.apache.druid.segment.data.RoaringBitmapSerdeFactory) PartitionConfigurationManager(org.apache.druid.indexing.common.task.CompactionTask.PartitionConfigurationManager) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test)

Example 39 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From the class DetermineHashedPartitionsJob, method run.

@Override
public boolean run() {
    try {
        /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */
        startTime = System.currentTimeMillis();
        groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
        JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
        config.addJobProperties(groupByJob);
        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (config.getInputIntervals().isEmpty()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
        // Store the jobId in the file
        if (groupByJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
        }
        try {
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }
        /*
       * Load partitions and intervals determined by the previous job.
       */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (config.getInputIntervals().isEmpty()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
            });
            config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
            log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
        PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
            throw new ISE("%s is expected, but got %s", HashedPartitionsSpec.class.getName(), partitionsSpec.getClass().getName());
        }
        HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), Long.class);
                log.info("Found approximately [%,d] rows in data.", numRows);
                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
                log.info("Creating [%,d] shards", numberOfShards);
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                for (int i = 0; i < numberOfShards; ++i) {
                    actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, i, numberOfShards, null, partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }
                shardSpecs.put(bucket.getMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) TreeMap(java.util.TreeMap) DateTime(org.joda.time.DateTime) IOException(java.io.IOException) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) FileSystem(org.apache.hadoop.fs.FileSystem) ISE(org.apache.druid.java.util.common.ISE) ArrayList(java.util.ArrayList) List(java.util.List) Interval(org.joda.time.Interval)
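
The shard count chosen above is simply the approximate row count determined for an interval, divided by the configured target partition size and rounded up (the Math.ceil call in the loop). A small worked example with hypothetical numbers:

public class ShardCountSketch {

    public static void main(String[] args) {
        // Hypothetical values for illustration; the job reads numRows from the groupBy job output
        // and targetPartitionSize from the hashed partitions spec.
        long numRows = 5_000_000L;
        long targetPartitionSize = 2_000_000L;
        int numberOfShards = (int) Math.ceil((double) numRows / targetPartitionSize);
        System.out.println(numberOfShards); // 3: shard specs 0, 1 and 2 are created for this interval
    }
}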

Example 40 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From the class HadoopDruidDetermineConfigurationJob, method run.

@Override
public boolean run() {
    JobHelper.ensurePaths(config);
    if (config.isDeterminingPartitions()) {
        job = createPartitionJob(config);
        config.setHadoopJobIdFileName(hadoopJobIdFile);
        boolean jobSucceeded = JobHelper.runSingleJob(job);
        JobHelper.maybeDeleteIntermediatePath(jobSucceeded, config.getSchema());
        return jobSucceeded;
    } else {
        final PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        final int shardsPerInterval;
        final HashPartitionFunction partitionFunction;
        if (partitionsSpec instanceof HashedPartitionsSpec) {
            final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
            shardsPerInterval = PartitionsSpec.isEffectivelyNull(hashedPartitionsSpec.getNumShards()) ? 1 : hashedPartitionsSpec.getNumShards();
            partitionFunction = hashedPartitionsSpec.getPartitionFunction();
        } else {
            shardsPerInterval = 1;
            partitionFunction = null;
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            // shardsPerInterval was forced to 1 above when numShards was not effectively set
            List<HadoopyShardSpec> specs = Lists.newArrayListWithCapacity(shardsPerInterval);
            for (int i = 0; i < shardsPerInterval; i++) {
                specs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, shardsPerInterval, i, shardsPerInterval, config.getPartitionsSpec().getPartitionDimensions(), partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
            }
            shardSpecs.put(bucket.getMillis(), specs);
            log.info("DateTime[%s], spec[%s]", bucket, specs);
        }
        config.setShardSpecs(shardSpecs);
        return true;
    }
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) TreeMap(java.util.TreeMap) DateTime(org.joda.time.DateTime) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) List(java.util.List) Interval(org.joda.time.Interval)
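
For context, a hash-based numbered shard spec routes each row by hashing its partition key and reducing the hash modulo the bucket count. The sketch below only illustrates that idea; Druid's real HashPartitionFunction (for example MURMUR3_32_ABS) hashes a serialized form of the grouping key rather than Java hash codes:

import java.util.Arrays;
import java.util.List;

public class HashBucketSketch {

    // Simplified stand-in for a hash partition function: map the partition-dimension values
    // of one row to a bucket in [0, numBuckets).
    static int bucketFor(List<Object> partitionKey, int numBuckets) {
        int hash = Arrays.hashCode(partitionKey.toArray());
        return Math.floorMod(hash, numBuckets);
    }

    public static void main(String[] args) {
        // With numShards = 3, every row lands in bucket 0, 1, or 2.
        System.out.println(bucketFor(List.of("dim1-value", "dim2-value"), 3));
    }
}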

Aggregations

HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec): 43
Test (org.junit.Test): 31
Interval (org.joda.time.Interval): 20
DataSegment (org.apache.druid.timeline.DataSegment): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec): 12
Map (java.util.Map): 11
SingleDimensionPartitionsSpec (org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec): 11
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 11
ArrayList (java.util.ArrayList): 10
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 9
StringUtils (org.apache.druid.java.util.common.StringUtils): 9
File (java.io.File): 8
HashMap (java.util.HashMap): 8
DynamicPartitionsSpec (org.apache.druid.indexer.partitions.DynamicPartitionsSpec): 8
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 8
HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction): 8
ImmutableMap (com.google.common.collect.ImmutableMap): 7
IOException (java.io.IOException): 7