
Example 6 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class DeterminePartitionsJob method run.

@Override
public boolean run() {
    try {
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE("DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]", config.getPartitionsSpec());
        }
        final SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) config.getPartitionsSpec();
        if (!partitionsSpec.isAssumeGrouped()) {
            groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
            JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
            config.addJobProperties(groupByJob);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
            // Store the jobId in the file
            if (groupByJob.getJobID() != null) {
                JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
            }
            try {
                if (!groupByJob.waitForCompletion(true)) {
                    log.error("Job failed: %s", groupByJob.getJobID());
                    failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
                    return false;
                }
            } catch (IOException ioe) {
                if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
                    throw ioe;
                }
            }
        } else {
            log.info("Skipping group-by job.");
        }
        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        JobHelper.injectSystemProperties(dimSelectionJob.getConfiguration(), config);
        config.addJobProperties(dimSelectionJob);
        if (!partitionsSpec.isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }
        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob, DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setNumReduceTasks(Iterators.size(config.getGranularitySpec().sortedBucketIntervals().iterator()));
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);
        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
        // Store the jobId in the file
        if (dimSelectionJob.getJobID() != null) {
            JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), dimSelectionJob.getJobID().toString());
        }
        try {
            if (!dimSelectionJob.waitForCompletion(true)) {
                log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
                failureCause = Utils.getFailureMessage(dimSelectionJob, HadoopDruidIndexerConfig.JSON_MAPPER);
                return false;
            }
        } catch (IOException ioe) {
            if (!Utils.checkAppSuccessForJobIOException(ioe, dimSelectionJob, config.isUseYarnRMJobStatusFallback())) {
                throw ioe;
            }
        }
        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(dimSelectionJob, partitionInfoPath), new TypeReference<List<ShardSpec>>() {
                });
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
                }
                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        return true;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) IOException(java.io.IOException) TreeMap(java.util.TreeMap) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) SingleDimensionShardSpec(org.apache.druid.timeline.partition.SingleDimensionShardSpec) InvalidJobConfException(org.apache.hadoop.mapred.InvalidJobConfException) IOException(java.io.IOException) FileSystem(org.apache.hadoop.fs.FileSystem) ISE(org.apache.druid.java.util.common.ISE) List(java.util.List) ArrayList(java.util.ArrayList) Job(org.apache.hadoop.mapreduce.Job) Interval(org.joda.time.Interval)
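
The partition files written by the dim-selection reducers are read back as a List&lt;ShardSpec&gt; via Jackson's TypeReference, which preserves the generic element type through deserialization. Below is a minimal stand-alone sketch of that pattern; it uses a hypothetical PartitionInfo DTO instead of Druid's polymorphic ShardSpec hierarchy, which would additionally need the subtype registrations carried by HadoopDruidIndexerConfig.JSON_MAPPER.

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.List;

public class PartitionInfoReader {
    // Hypothetical stand-in for one shard spec entry; Druid's real ShardSpec is polymorphic.
    public static class PartitionInfo {
        public String dimension;
        public String start;
        public String end;
    }

    // Deserialize a JSON array into a typed list. The anonymous TypeReference keeps the
    // List<PartitionInfo> element type available at runtime despite erasure.
    public static List<PartitionInfo> read(ObjectMapper mapper, String json) throws Exception {
        return mapper.readValue(json, new TypeReference<List<PartitionInfo>>() {});
    }

    public static void main(String[] args) throws Exception {
        String json = "[{\"dimension\":\"dim\",\"start\":null,\"end\":\"x\"}]";
        List<PartitionInfo> specs = read(new ObjectMapper(), json);
        System.out.println(specs.size() + " spec(s), first ends at " + specs.get(0).end);
    }
}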

Example 7 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class PartialGenericSegmentMergeTask method createIntervalAndIntegerToShardSpec.

private static Table<Interval, Integer, BuildingShardSpec<?>> createIntervalAndIntegerToShardSpec(List<PartitionLocation> partitionLocations) {
    final Table<Interval, Integer, BuildingShardSpec<?>> intervalAndIntegerToShardSpec = HashBasedTable.create();
    partitionLocations.forEach(p -> {
        final ShardSpec currShardSpec = intervalAndIntegerToShardSpec.get(p.getInterval(), p.getBucketId());
        if (currShardSpec == null) {
            intervalAndIntegerToShardSpec.put(p.getInterval(), p.getBucketId(), p.getShardSpec());
        } else {
            if (!p.getShardSpec().equals(currShardSpec)) {
                throw new ISE("interval %s, bucketId %s mismatched shard specs: %s and %s", p.getInterval(), p.getBucketId(), currShardSpec, p.getShardSpec());
            }
        }
    });
    return intervalAndIntegerToShardSpec;
}
Also used : ISE(org.apache.druid.java.util.common.ISE) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) BuildingShardSpec(org.apache.druid.timeline.partition.BuildingShardSpec) Interval(org.joda.time.Interval)
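
The method relies on Guava's Table to key shard specs by the (interval, bucketId) pair and to reject conflicting specs for the same cell. A self-contained sketch of that idiom follows, with a hypothetical Location record standing in for PartitionLocation and plain strings standing in for intervals and shard specs.

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
import java.util.List;
import java.util.Objects;

public class BucketTableSketch {
    // Hypothetical (interval, bucketId, spec) triple standing in for PartitionLocation.
    record Location(String interval, int bucketId, String spec) {}

    static Table<String, Integer, String> collect(List<Location> locations) {
        final Table<String, Integer, String> table = HashBasedTable.create();
        for (Location loc : locations) {
            final String existing = table.get(loc.interval(), loc.bucketId());
            if (existing == null) {
                table.put(loc.interval(), loc.bucketId(), loc.spec());
            } else if (!Objects.equals(existing, loc.spec())) {
                // Every location for the same (interval, bucket) cell must agree on its spec.
                throw new IllegalStateException("mismatched specs: " + existing + " vs " + loc.spec());
            }
        }
        return table;
    }

    public static void main(String[] args) {
        Table<String, Integer, String> table = collect(List.of(
            new Location("2014-01-01/2014-01-02", 0, "range[a,b)"),
            new Location("2014-01-01/2014-01-02", 0, "range[a,b)")));
        System.out.println(table.size()); // 1: duplicates collapse into one cell
    }
}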

Example 8 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class CompactionTaskParallelRunTest method testCompactRangeAndDynamicPartitionedSegments.

@Test
public void testCompactRangeAndDynamicPartitionedSegments() {
    runIndexTask(new SingleDimensionPartitionsSpec(2, null, "dim", false), false);
    runIndexTask(null, true);
    final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
    final CompactionTask compactionTask = builder.inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null)).tuningConfig(AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING).build();
    final Map<Interval, List<DataSegment>> intervalToSegments = SegmentUtils.groupSegmentsByInterval(runTask(compactionTask));
    Assert.assertEquals(3, intervalToSegments.size());
    Assert.assertEquals(ImmutableSet.of(Intervals.of("2014-01-01T00/PT1H"), Intervals.of("2014-01-01T01/PT1H"), Intervals.of("2014-01-01T02/PT1H")), intervalToSegments.keySet());
    for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
        final List<DataSegment> segmentsInInterval = entry.getValue();
        Assert.assertEquals(1, segmentsInInterval.size());
        final ShardSpec shardSpec = segmentsInInterval.get(0).getShardSpec();
        if (lockGranularity == LockGranularity.TIME_CHUNK) {
            Assert.assertSame(NumberedShardSpec.class, shardSpec.getClass());
            final NumberedShardSpec numberedShardSpec = (NumberedShardSpec) shardSpec;
            Assert.assertEquals(0, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getNumCorePartitions());
        } else {
            Assert.assertSame(NumberedOverwriteShardSpec.class, shardSpec.getClass());
            final NumberedOverwriteShardSpec numberedShardSpec = (NumberedOverwriteShardSpec) shardSpec;
            Assert.assertEquals(PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getAtomicUpdateGroupSize());
        }
    }
}
Also used : Builder(org.apache.druid.indexing.common.task.CompactionTask.Builder) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) DataSegment(org.apache.druid.timeline.DataSegment) DimensionRangeShardSpec(org.apache.druid.timeline.partition.DimensionRangeShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) SingleDimensionShardSpec(org.apache.druid.timeline.partition.SingleDimensionShardSpec) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) Interval(org.joda.time.Interval) AbstractParallelIndexSupervisorTaskTest(org.apache.druid.indexing.common.task.batch.parallel.AbstractParallelIndexSupervisorTaskTest) Test(org.junit.Test)
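
The test's two branches hinge on how Druid separates partition IDs: core partitions are numbered from 0, while overwriting segments produced under segment locks draw their IDs from a disjoint higher range starting at PartitionIds.NON_ROOT_GEN_START_PARTITION_ID. A small sketch of that idea follows; the concrete constant value here is an assumption for illustration, not taken from the Druid source.

public class PartitionIdRanges {
    static final int ROOT_GEN_START_PARTITION_ID = 0;
    // Assumed value for illustration; Druid defines the real constant in PartitionIds.
    static final int NON_ROOT_GEN_START_PARTITION_ID = 32768;

    // Overwriting ("non-root generation") segments use IDs from the upper range so they can
    // never collide with core partition numbers assigned from 0 upwards.
    static boolean isOverwritePartition(int partitionId) {
        return partitionId >= NON_ROOT_GEN_START_PARTITION_ID;
    }

    public static void main(String[] args) {
        System.out.println(isOverwritePartition(ROOT_GEN_START_PARTITION_ID));     // false
        System.out.println(isOverwritePartition(NON_ROOT_GEN_START_PARTITION_ID)); // true
    }
}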

Example 9 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class CompactionTaskParallelRunTest method testCompactHashAndDynamicPartitionedSegments.

@Test
public void testCompactHashAndDynamicPartitionedSegments() {
    runIndexTask(new HashedPartitionsSpec(null, 2, null), false);
    runIndexTask(null, true);
    final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
    final CompactionTask compactionTask = builder.inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null)).tuningConfig(AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING).build();
    final Map<Interval, List<DataSegment>> intervalToSegments = SegmentUtils.groupSegmentsByInterval(runTask(compactionTask));
    Assert.assertEquals(3, intervalToSegments.size());
    Assert.assertEquals(ImmutableSet.of(Intervals.of("2014-01-01T00/PT1H"), Intervals.of("2014-01-01T01/PT1H"), Intervals.of("2014-01-01T02/PT1H")), intervalToSegments.keySet());
    for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
        final List<DataSegment> segmentsInInterval = entry.getValue();
        Assert.assertEquals(1, segmentsInInterval.size());
        final ShardSpec shardSpec = segmentsInInterval.get(0).getShardSpec();
        if (lockGranularity == LockGranularity.TIME_CHUNK) {
            Assert.assertSame(NumberedShardSpec.class, shardSpec.getClass());
            final NumberedShardSpec numberedShardSpec = (NumberedShardSpec) shardSpec;
            Assert.assertEquals(0, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getNumCorePartitions());
        } else {
            Assert.assertSame(NumberedOverwriteShardSpec.class, shardSpec.getClass());
            final NumberedOverwriteShardSpec numberedShardSpec = (NumberedOverwriteShardSpec) shardSpec;
            Assert.assertEquals(PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, numberedShardSpec.getPartitionNum());
            Assert.assertEquals(1, numberedShardSpec.getAtomicUpdateGroupSize());
        }
    }
}
Also used : HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Builder(org.apache.druid.indexing.common.task.CompactionTask.Builder) DataSegment(org.apache.druid.timeline.DataSegment) DimensionRangeShardSpec(org.apache.druid.timeline.partition.DimensionRangeShardSpec) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) SingleDimensionShardSpec(org.apache.druid.timeline.partition.SingleDimensionShardSpec) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) Interval(org.joda.time.Interval) AbstractParallelIndexSupervisorTaskTest(org.apache.druid.indexing.common.task.batch.parallel.AbstractParallelIndexSupervisorTaskTest) Test(org.junit.Test)
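
SegmentUtils.groupSegmentsByInterval, used above to bucket the compacted segments, is essentially a group-by over each segment's interval. A hedged stand-alone equivalent using java.util.stream is sketched below, with a hypothetical Segment record in place of DataSegment.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupByIntervalSketch {
    // Hypothetical minimal segment type; DataSegment exposes getInterval() in the same spirit.
    record Segment(String interval, int partitionNum) {}

    // One map entry per distinct interval, holding every segment that falls into it.
    static Map<String, List<Segment>> groupByInterval(List<Segment> segments) {
        return segments.stream().collect(Collectors.groupingBy(Segment::interval));
    }

    public static void main(String[] args) {
        Map<String, List<Segment>> grouped = groupByInterval(List.of(
            new Segment("2014-01-01T00/PT1H", 0),
            new Segment("2014-01-01T01/PT1H", 0),
            new Segment("2014-01-01T01/PT1H", 1)));
        System.out.println(grouped.size()); // 2 intervals
    }
}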

Example 10 with ShardSpec

use of org.apache.druid.timeline.partition.ShardSpec in project druid by druid-io.

the class ShardSpecsTest method testShardSpecSelectionWithNullPartitionDimension.

@Test
public void testShardSpecSelectionWithNullPartitionDimension() {
    HashBucketShardSpec spec1 = new HashBucketShardSpec(0, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
    HashBucketShardSpec spec2 = new HashBucketShardSpec(1, 2, null, HashPartitionFunction.MURMUR3_32_ABS, jsonMapper);
    Map<Interval, List<BucketNumberedShardSpec<?>>> shardSpecMap = new HashMap<>();
    shardSpecMap.put(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), ImmutableList.of(spec1, spec2));
    ShardSpecs shardSpecs = new ShardSpecs(shardSpecMap, Granularities.HOUR);
    String visitorId = "visitorId";
    String clientType = "clientType";
    long timestamp1 = DateTimes.of("2014-01-01T00:00:00.000Z").getMillis();
    InputRow row1 = new MapBasedInputRow(timestamp1, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    long timestamp2 = DateTimes.of("2014-01-01T00:30:20.456Z").getMillis();
    InputRow row2 = new MapBasedInputRow(timestamp2, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    long timestamp3 = DateTimes.of("2014-01-01T10:10:20.456Z").getMillis();
    InputRow row3 = new MapBasedInputRow(timestamp3, Lists.newArrayList(visitorId, clientType), ImmutableMap.of(visitorId, "0", clientType, "iphone"));
    ShardSpec spec3 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row1);
    ShardSpec spec4 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row2);
    ShardSpec spec5 = shardSpecs.getShardSpec(Intervals.of("2014-01-01T00:00:00.000Z/2014-01-02T00:00:00.000Z"), row3);
    Assert.assertSame(spec3, spec4);
    Assert.assertNotSame(spec3, spec5);
}
Also used : HashMap(java.util.HashMap) HashBucketShardSpec(org.apache.druid.timeline.partition.HashBucketShardSpec) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) MapBasedInputRow(org.apache.druid.data.input.MapBasedInputRow) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) HashBucketShardSpec(org.apache.druid.timeline.partition.HashBucketShardSpec) BucketNumberedShardSpec(org.apache.druid.timeline.partition.BucketNumberedShardSpec) Interval(org.joda.time.Interval) Test(org.junit.Test)
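
The assertions show that, with no partition dimension configured, rows whose timestamps fall into the same HOUR bucket (and that carry identical dimension values) resolve to the same shard spec, while a row in a different hour can land in another bucket. The sketch below illustrates that bucketing idea with a simple hash; it is only an approximation of HashBucketShardSpec, which as configured in the test applies the MURMUR3_32_ABS partition function to the row's group key.

import java.util.List;
import java.util.Objects;

public class HourBucketSelector {
    private static final long HOUR_MILLIS = 3_600_000L;

    // Pick a bucket by hashing the hour-truncated timestamp together with the dimension
    // values. This mirrors the behaviour exercised by the test, not Druid's exact hash.
    static int selectBucket(long timestampMillis, List<String> dimensionValues, int numBuckets) {
        long truncated = (timestampMillis / HOUR_MILLIS) * HOUR_MILLIS;
        return Math.floorMod(Objects.hash(truncated, dimensionValues), numBuckets);
    }

    public static void main(String[] args) {
        List<String> dims = List.of("0", "iphone");
        long t1 = 1388534400000L; // 2014-01-01T00:00:00.000Z
        long t2 = 1388536220456L; // 2014-01-01T00:30:20.456Z, same hour as t1
        long t3 = 1388571020456L; // 2014-01-01T10:10:20.456Z, a different hour
        System.out.println(selectBucket(t1, dims, 2) == selectBucket(t2, dims, 2)); // true
        System.out.println(selectBucket(t1, dims, 2) == selectBucket(t3, dims, 2)); // may be false
    }
}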

Aggregations

ShardSpec (org.apache.druid.timeline.partition.ShardSpec): 20
Interval (org.joda.time.Interval): 13
ArrayList (java.util.ArrayList): 8
DataSegment (org.apache.druid.timeline.DataSegment): 8
NumberedShardSpec (org.apache.druid.timeline.partition.NumberedShardSpec): 8
List (java.util.List): 7
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 6
SingleDimensionShardSpec (org.apache.druid.timeline.partition.SingleDimensionShardSpec): 6
Test (org.junit.Test): 6
ImmutableList (com.google.common.collect.ImmutableList): 5
HashMap (java.util.HashMap): 5
ImmutableMap (com.google.common.collect.ImmutableMap): 3
Map (java.util.Map): 3
TreeMap (java.util.TreeMap): 3
ISE (org.apache.druid.java.util.common.ISE): 3
BucketNumberedShardSpec (org.apache.druid.timeline.partition.BucketNumberedShardSpec): 3
DateTime (org.joda.time.DateTime): 3
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 2
IOException (java.io.IOException): 2
Collectors (java.util.stream.Collectors): 2