
Example 16 with HashBasedNumberedShardSpec

Use of org.apache.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

In class HadoopDruidIndexerConfigTest, the method testHashedBucketSelection:

@Test
public void testHashedBucketSelection() {
    List<HadoopyShardSpec> shardSpecs = new ArrayList<>();
    final int partitionCount = 10;
    for (int i = 0; i < partitionCount; i++) {
        shardSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, partitionCount, i, partitionCount, null, HashPartitionFunction.MURMUR3_32_ABS, new DefaultObjectMapper()), i));
    }
    HadoopIngestionSpec spec = new HadoopIngestionSpecBuilder().shardSpecs(ImmutableMap.of(DateTimes.of("2010-01-01T01:00:00").getMillis(), shardSpecs)).build();
    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig.fromSpec(spec);
    final List<String> dims = Arrays.asList("diM1", "dIM2");
    final ImmutableMap<String, Object> values = ImmutableMap.of("Dim1", "1", "DiM2", "2", "dim1", "3", "dim2", "4");
    final long timestamp = DateTimes.of("2010-01-01T01:00:01").getMillis();
    final Bucket expectedBucket = config.getBucket(new MapBasedInputRow(timestamp, dims, values)).get();
    final long nextBucketTimestamp = Granularities.MINUTE.bucketEnd(DateTimes.utc(timestamp)).getMillis();
    // Check that all rows with the same dims and the same truncated timestamp hash to the same bucket
    for (int i = 0; timestamp + i < nextBucketTimestamp; i++) {
        Assert.assertEquals(expectedBucket.partitionNum, config.getBucket(new MapBasedInputRow(timestamp + i, dims, values)).get().partitionNum);
    }
}
Also used: HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec), ArrayList (java.util.ArrayList), DefaultObjectMapper (org.apache.druid.jackson.DefaultObjectMapper), MapBasedInputRow (org.apache.druid.data.input.MapBasedInputRow), Test (org.junit.Test)
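
Taken together, the call sites in these examples suggest the seven-argument constructor takes, in order: partitionNum, partitions, bucketId, numBuckets, partitionDimensions, partitionFunction, and a Jackson ObjectMapper. Below is a minimal sketch under that assumption; the dimension list and bucket count are illustrative and not taken from the test above.

import java.util.Arrays;

import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.timeline.partition.HashBasedNumberedShardSpec;
import org.apache.druid.timeline.partition.HashPartitionFunction;

public class HashShardSpecSketch {
    public static void main(String[] args) {
        // Hypothetical: split one time bucket into three hash-based partitions on "dim1".
        final int numBuckets = 3;
        for (int i = 0; i < numBuckets; i++) {
            HashBasedNumberedShardSpec spec = new HashBasedNumberedShardSpec(
                // partitionNum: id of this partition within the interval
                i,
                // partitions: total number of partitions in the interval
                numBuckets,
                // bucketId: hash bucket covered by this partition
                i,
                // numBuckets: total number of hash buckets
                numBuckets,
                // partitionDimensions: dimensions to hash on (the test above passes null)
                Arrays.asList("dim1"),
                // partitionFunction: same function used in the test above
                HashPartitionFunction.MURMUR3_32_ABS,
                // ObjectMapper used when hashing rows into buckets
                new DefaultObjectMapper()
            );
            System.out.println(spec);
        }
    }
}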

Example 17 with HashBasedNumberedShardSpec

Use of org.apache.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

In class DetermineHashedPartitionsJob, the method run:

@Override
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        startTime = System.currentTimeMillis();
        groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
        JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
        config.addJobProperties(groupByJob);
        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (config.getInputIntervals().isEmpty()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
        // Store the jobId in the file
        if (groupByJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
        }
        try {
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }
        /*
         * Load partitions and intervals determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (config.getInputIntervals().isEmpty()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
            });
            config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
            log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
        PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
            throw new ISE("%s is expected, but got %s", HashedPartitionsSpec.class.getName(), partitionsSpec.getClass().getName());
        }
        HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), Long.class);
                log.info("Found approximately [%,d] rows in data.", numRows);
                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
                log.info("Creating [%,d] shards", numberOfShards);
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                for (int i = 0; i < numberOfShards; ++i) {
                    actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, i, numberOfShards, null, partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }
                shardSpecs.put(bucket.getMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec), HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec), Configuration (org.apache.hadoop.conf.Configuration), IOException (java.io.IOException), TreeMap (java.util.TreeMap), DateTime (org.joda.time.DateTime), UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec), HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction), PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec), FileSystem (org.apache.hadoop.fs.FileSystem), ISE (org.apache.druid.java.util.common.ISE), ArrayList (java.util.ArrayList), List (java.util.List), Interval (org.joda.time.Interval)
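
The shard count computed in the loop above is simply the approximate row count divided by the target partition size, rounded up. A minimal sketch of that arithmetic; the row count and target size are assumed values for illustration, not taken from the job:

public class ShardCountSketch {
    public static void main(String[] args) {
        // Assumed values for illustration only.
        final long numRows = 12_000_000L;
        final long targetPartitionSize = 5_000_000L;
        // Same computation as in DetermineHashedPartitionsJob above: round up so no shard exceeds the target.
        final int numberOfShards = (int) Math.ceil((double) numRows / targetPartitionSize);
        System.out.println(numberOfShards); // 3
    }
}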

Example 18 with HashBasedNumberedShardSpec

Use of org.apache.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

In class HadoopDruidDetermineConfigurationJob, the method run:

@Override
public boolean run() {
    JobHelper.ensurePaths(config);
    if (config.isDeterminingPartitions()) {
        job = createPartitionJob(config);
        config.setHadoopJobIdFileName(hadoopJobIdFile);
        boolean jobSucceeded = JobHelper.runSingleJob(job);
        JobHelper.maybeDeleteIntermediatePath(jobSucceeded, config.getSchema());
        return jobSucceeded;
    } else {
        final PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        final int shardsPerInterval;
        final HashPartitionFunction partitionFunction;
        if (partitionsSpec instanceof HashedPartitionsSpec) {
            final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
            shardsPerInterval = PartitionsSpec.isEffectivelyNull(hashedPartitionsSpec.getNumShards()) ? 1 : hashedPartitionsSpec.getNumShards();
            partitionFunction = hashedPartitionsSpec.getPartitionFunction();
        } else {
            shardsPerInterval = 1;
            partitionFunction = null;
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            // shardsPerInterval is at least 1 here; an effectively-null numShards was mapped to a single shard above
            List<HadoopyShardSpec> specs = Lists.newArrayListWithCapacity(shardsPerInterval);
            for (int i = 0; i < shardsPerInterval; i++) {
                specs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, shardsPerInterval, i, shardsPerInterval, config.getPartitionsSpec().getPartitionDimensions(), partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
            }
            shardSpecs.put(bucket.getMillis(), specs);
            log.info("DateTime[%s], spec[%s]", bucket, specs);
        }
        config.setShardSpecs(shardSpecs);
        return true;
    }
}
Also used: HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec), HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec), TreeMap (java.util.TreeMap), DateTime (org.joda.time.DateTime), HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction), SingleDimensionPartitionsSpec (org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec), PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec), List (java.util.List), Interval (org.joda.time.Interval)
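
One detail worth noting in the loop above: the HashBasedNumberedShardSpec partition number restarts at 0 for every segment interval, while the second HadoopyShardSpec argument (shardCount) keeps incrementing across all intervals. A minimal sketch of that numbering scheme, with an assumed interval count and shards-per-interval value:

public class ShardNumberingSketch {
    public static void main(String[] args) {
        // Assumed values for illustration only.
        final int intervals = 2;
        final int shardsPerInterval = 3;
        int shardCount = 0;
        for (int interval = 0; interval < intervals; interval++) {
            for (int i = 0; i < shardsPerInterval; i++) {
                // i plays the role of partitionNum; shardCount is the global HadoopyShardSpec counter.
                System.out.printf("interval=%d partitionNum=%d shardCount=%d%n", interval, i, shardCount++);
            }
        }
    }
}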

Example 19 with HashBasedNumberedShardSpec

Use of org.apache.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

In class DetermineHashedPartitionsJobTest, the method testDetermineHashedPartitions:

@Test
public void testDetermineHashedPartitions() {
    DetermineHashedPartitionsJob determineHashedPartitionsJob = new DetermineHashedPartitionsJob(indexerConfig);
    determineHashedPartitionsJob.run();
    HashPartitionFunction expectedFunction = ((HashedPartitionsSpec) indexerConfig.getPartitionsSpec()).getPartitionFunction();
    Map<Long, List<HadoopyShardSpec>> shardSpecs = indexerConfig.getSchema().getTuningConfig().getShardSpecs();
    Assert.assertEquals(expectedNumTimeBuckets, shardSpecs.entrySet().size());
    int i = 0;
    for (Map.Entry<Long, List<HadoopyShardSpec>> entry : shardSpecs.entrySet()) {
        Assert.assertEquals(expectedNumOfShards[i++], entry.getValue().size(), errorMargin);
        for (HadoopyShardSpec eachShardSpec : entry.getValue()) {
            final HashBasedNumberedShardSpec hashShardSpec = (HashBasedNumberedShardSpec) eachShardSpec.getActualSpec();
            Assert.assertEquals(expectedFunction, hashShardSpec.getPartitionFunction());
        }
    }
}
Also used: HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec), HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec), HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction), ImmutableList (com.google.common.collect.ImmutableList), List (java.util.List), Map (java.util.Map), ImmutableMap (com.google.common.collect.ImmutableMap), Test (org.junit.Test)

Example 20 with HashBasedNumberedShardSpec

Use of org.apache.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

In class CachingClusteredClientTest, the method makeMockHashBasedSelector:

private ServerSelector makeMockHashBasedSelector(DruidServer server, List<String> partitionDimensions, @Nullable HashPartitionFunction partitionFunction, int partitionNum, int partitions) {
    final DataSegment segment = new DataSegment(SegmentId.dummy(DATA_SOURCE), null, null, null, new HashBasedNumberedShardSpec(partitionNum, partitions, partitionNum, partitions, partitionDimensions, partitionFunction, ServerTestHelper.MAPPER), null, 9, 0L);
    ServerSelector selector = new ServerSelector(segment, new HighestPriorityTierSelectorStrategy(new RandomServerSelectorStrategy()));
    selector.addServerAndUpdateSegment(new QueryableDruidServer(server, null), segment);
    return selector;
}
Also used: HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec), ServerSelector (org.apache.druid.client.selector.ServerSelector), HighestPriorityTierSelectorStrategy (org.apache.druid.client.selector.HighestPriorityTierSelectorStrategy), DataSegment (org.apache.druid.timeline.DataSegment), RandomServerSelectorStrategy (org.apache.druid.client.selector.RandomServerSelectorStrategy), QueryableDruidServer (org.apache.druid.client.selector.QueryableDruidServer)
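
A hypothetical call site for this helper, written as it might appear inside the same test class; the server variable and the argument values are assumptions for illustration, not taken from the test:

// Hypothetical usage: a selector for partition 0 of 3, hashed on "dim1" with MURMUR3_32_ABS.
ServerSelector selector = makeMockHashBasedSelector(
    server,
    ImmutableList.of("dim1"),
    HashPartitionFunction.MURMUR3_32_ABS,
    0,  // partitionNum
    3   // total partitions
);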

Aggregations

HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 20
Test (org.junit.Test): 15
DataSegment (org.apache.druid.timeline.DataSegment): 12
Interval (org.joda.time.Interval): 12
List (java.util.List): 11
ImmutableList (com.google.common.collect.ImmutableList): 9
Map (java.util.Map): 9
HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec): 8
ArrayList (java.util.ArrayList): 7
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 6
ImmutableMap (com.google.common.collect.ImmutableMap): 6
File (java.io.File): 6
HashMap (java.util.HashMap): 6
HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction): 6
IOException (java.io.IOException): 5
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 5
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 5
Intervals (org.apache.druid.java.util.common.Intervals): 4
StringUtils (org.apache.druid.java.util.common.StringUtils): 4
DataSchema (org.apache.druid.segment.indexing.DataSchema): 4