
Example 1 with HashPartitionFunction

Use of org.apache.druid.timeline.partition.HashPartitionFunction in the project druid by druid-io.

From the class DetermineHashedPartitionsJob, the run method:

@Override
public boolean run() {
    try {
        /*
         * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
         * in the final segment.
         */
        startTime = System.currentTimeMillis();
        groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
        JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
        config.addJobProperties(groupByJob);
        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (config.getInputIntervals().isEmpty()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
        // Store the jobId in the file
        if (groupByJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
        }
        try {
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }
        /*
         * Load partitions and intervals determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (config.getInputIntervals().isEmpty()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
            });
            config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
            log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
        PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
            throw new ISE("%s is expected, but got %s", HashedPartitionsSpec.class.getName(), partitionsSpec.getClass().getName());
        }
        HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), Long.class);
                log.info("Found approximately [%,d] rows in data.", numRows);
                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
                log.info("Creating [%,d] shards", numberOfShards);
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                for (int i = 0; i < numberOfShards; ++i) {
                    actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, i, numberOfShards, null, partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                }
                shardSpecs.put(bucket.getMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used: Path(org.apache.hadoop.fs.Path) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) TreeMap(java.util.TreeMap) DateTime(org.joda.time.DateTime) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) FileSystem(org.apache.hadoop.fs.FileSystem) ISE(org.apache.druid.java.util.common.ISE) ArrayList(java.util.ArrayList) List(java.util.List) Interval(org.joda.time.Interval)
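
The shard count in the loop above is a plain ceiling division of the approximate row count by the configured target partition size, and every resulting shard gets a HashBasedNumberedShardSpec carrying the same HashPartitionFunction. A minimal sketch of that arithmetic, with hypothetical numbers that are not taken from the Druid source:

// Hypothetical inputs, for illustration only.
long numRows = 5_000_000L;              // approximate row count reported by the grouping job
long targetPartitionSize = 1_500_000L;  // stand-in for config.getTargetPartitionSize()

// Same ceiling division as in DetermineHashedPartitionsJob: ceil(5,000,000 / 1,500,000) = 4 shards.
int numberOfShards = (int) Math.ceil((double) numRows / targetPartitionSize);
System.out.println(numberOfShards);     // 4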

Example 2 with HashPartitionFunction

Use of org.apache.druid.timeline.partition.HashPartitionFunction in the project druid by druid-io.

From the class HadoopDruidDetermineConfigurationJob, the run method:

@Override
public boolean run() {
    JobHelper.ensurePaths(config);
    if (config.isDeterminingPartitions()) {
        job = createPartitionJob(config);
        config.setHadoopJobIdFileName(hadoopJobIdFile);
        boolean jobSucceeded = JobHelper.runSingleJob(job);
        JobHelper.maybeDeleteIntermediatePath(jobSucceeded, config.getSchema());
        return jobSucceeded;
    } else {
        final PartitionsSpec partitionsSpec = config.getPartitionsSpec();
        final int shardsPerInterval;
        final HashPartitionFunction partitionFunction;
        if (partitionsSpec instanceof HashedPartitionsSpec) {
            final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
            shardsPerInterval = PartitionsSpec.isEffectivelyNull(hashedPartitionsSpec.getNumShards()) ? 1 : hashedPartitionsSpec.getNumShards();
            partitionFunction = hashedPartitionsSpec.getPartitionFunction();
        } else {
            shardsPerInterval = 1;
            partitionFunction = null;
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            DateTime bucket = segmentGranularity.getStart();
            // negative shardsPerInterval means a single shard
            List<HadoopyShardSpec> specs = Lists.newArrayListWithCapacity(shardsPerInterval);
            for (int i = 0; i < shardsPerInterval; i++) {
                specs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, shardsPerInterval, i, shardsPerInterval, config.getPartitionsSpec().getPartitionDimensions(), partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
            }
            shardSpecs.put(bucket.getMillis(), specs);
            log.info("DateTime[%s], spec[%s]", bucket, specs);
        }
        config.setShardSpecs(shardSpecs);
        return true;
    }
}
Also used: HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) TreeMap(java.util.TreeMap) DateTime(org.joda.time.DateTime) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) List(java.util.List) Interval(org.joda.time.Interval)
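
When the partitions are not determined by a separate job, the shard count per interval is taken straight from the spec: an unset numShards falls back to a single shard, and a non-hashed spec gets one shard per interval with no hash partition function at all. A minimal sketch of that fallback, assuming PartitionsSpec.isEffectivelyNull simply treats a missing value as unset (values here are hypothetical, not from the Druid source):

// Hypothetical value, for illustration only; HashedPartitionsSpec#getNumShards() may legitimately be null.
Integer numShards = null;

// Mirrors the check above: an unset shard count means one shard per interval.
// (Assumption: PartitionsSpec.isEffectivelyNull(numShards) reduces to this null check for this example.)
int shardsPerInterval = (numShards == null) ? 1 : numShards;
System.out.println(shardsPerInterval);  // 1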

Example 3 with HashPartitionFunction

Use of org.apache.druid.timeline.partition.HashPartitionFunction in the project druid by druid-io.

From the class DetermineHashedPartitionsJobTest, the testDetermineHashedPartitions method:

@Test
public void testDetermineHashedPartitions() {
    DetermineHashedPartitionsJob determineHashedPartitionsJob = new DetermineHashedPartitionsJob(indexerConfig);
    determineHashedPartitionsJob.run();
    HashPartitionFunction expectedFunction = ((HashedPartitionsSpec) indexerConfig.getPartitionsSpec()).getPartitionFunction();
    Map<Long, List<HadoopyShardSpec>> shardSpecs = indexerConfig.getSchema().getTuningConfig().getShardSpecs();
    Assert.assertEquals(expectedNumTimeBuckets, shardSpecs.entrySet().size());
    int i = 0;
    for (Map.Entry<Long, List<HadoopyShardSpec>> entry : shardSpecs.entrySet()) {
        Assert.assertEquals(expectedNumOfShards[i++], entry.getValue().size(), errorMargin);
        for (HadoopyShardSpec eachShardSpec : entry.getValue()) {
            final HashBasedNumberedShardSpec hashShardSpec = (HashBasedNumberedShardSpec) eachShardSpec.getActualSpec();
            Assert.assertEquals(expectedFunction, hashShardSpec.getPartitionFunction());
        }
    }
}
Also used: HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)
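
The test only verifies that the configured partition function was propagated into every HashBasedNumberedShardSpec. For context, a minimal sketch of how such a function assigns a row to a bucket; it assumes the HashPartitionFunction enum exposes a MURMUR3_32_ABS constant and a hash(byte[] serializedRow, int numBuckets) method, so verify the signature against your Druid version:

// Assumed API: HashPartitionFunction.MURMUR3_32_ABS and hash(byte[], int).
// Requires org.apache.druid.timeline.partition.HashPartitionFunction and java.nio.charset.StandardCharsets.
HashPartitionFunction partitionFunction = HashPartitionFunction.MURMUR3_32_ABS;
byte[] serializedGroupKey = "[\"2020-01-01T00:00:00.000Z\",\"someDimValue\"]".getBytes(StandardCharsets.UTF_8);
int numBuckets = 4;

// Rows with the same serialized group key always land in the same bucket, 0 <= bucket < numBuckets.
int bucket = partitionFunction.hash(serializedGroupKey, numBuckets);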

Aggregations

List (java.util.List): 3
HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec): 3
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 3
HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction): 3
TreeMap (java.util.TreeMap): 2
PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec): 2
DateTime (org.joda.time.DateTime): 2
Interval (org.joda.time.Interval): 2
ImmutableList (com.google.common.collect.ImmutableList): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
Map (java.util.Map): 1
SingleDimensionPartitionsSpec (org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec): 1
ISE (org.apache.druid.java.util.common.ISE): 1
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
Path (org.apache.hadoop.fs.Path): 1
Test (org.junit.Test): 1