Use of org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
The class CompactSegmentsTest, method testRunWithLockedIntervalsNoSkip.
@Test
public void testRunWithLockedIntervalsNoSkip()
{
  Mockito.when(COORDINATOR_CONFIG.getCompactionSkipLockedIntervals()).thenReturn(false);

  final TestDruidLeaderClient leaderClient = new TestDruidLeaderClient(JSON_MAPPER);
  leaderClient.start();
  HttpIndexingServiceClient indexingServiceClient = new HttpIndexingServiceClient(JSON_MAPPER, leaderClient);

  // Lock all intervals for all the dataSources
  final String datasource0 = DATA_SOURCE_PREFIX + 0;
  leaderClient.lockedIntervals
              .computeIfAbsent(datasource0, k -> new ArrayList<>())
              .add(Intervals.of("2017/2018"));

  final String datasource1 = DATA_SOURCE_PREFIX + 1;
  leaderClient.lockedIntervals
              .computeIfAbsent(datasource1, k -> new ArrayList<>())
              .add(Intervals.of("2017/2018"));

  final String datasource2 = DATA_SOURCE_PREFIX + 2;
  leaderClient.lockedIntervals
              .computeIfAbsent(datasource2, k -> new ArrayList<>())
              .add(Intervals.of("2017/2018"));

  // Verify that no locked intervals are skipped
  CompactSegments compactSegments = new CompactSegments(COORDINATOR_CONFIG, JSON_MAPPER, indexingServiceClient);

  int maxTaskSlots = partitionsSpec instanceof SingleDimensionPartitionsSpec ? 5 : 3;
  final CoordinatorStats stats = doCompactSegments(compactSegments, createCompactionConfigs(1), maxTaskSlots);
  Assert.assertEquals(3, stats.getGlobalStat(CompactSegments.COMPACTION_TASK_COUNT));
  Assert.assertEquals(3, leaderClient.submittedCompactionTasks.size());
  leaderClient.submittedCompactionTasks.forEach(task -> {
    System.out.println(task.getDataSource() + " : " + task.getIoConfig().getInputSpec().getInterval());
  });

  // Verify that tasks are submitted for the latest interval of each dataSource
  final Map<String, Interval> datasourceToInterval = new HashMap<>();
  leaderClient.submittedCompactionTasks.forEach(
      task -> datasourceToInterval.put(task.getDataSource(), task.getIoConfig().getInputSpec().getInterval())
  );
  Assert.assertEquals(Intervals.of("2017-01-09T00:00:00Z/2017-01-09T12:00:00Z"), datasourceToInterval.get(datasource0));
  Assert.assertEquals(Intervals.of("2017-01-09T00:00:00Z/2017-01-09T12:00:00Z"), datasourceToInterval.get(datasource1));
  Assert.assertEquals(Intervals.of("2017-01-09T00:00:00Z/2017-01-09T12:00:00Z"), datasourceToInterval.get(datasource2));
}
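The partitionsSpec referenced in the maxTaskSlots line above is supplied by the test class's parameterization and is not constructed in this method. As a minimal sketch (not taken from the Druid test itself), a range spec of that kind can be built with the same four-argument constructor that appears in the other snippets on this page; the argument labels in the comments are my reading of those calls, not documentation from the Druid source.

// Minimal sketch, assuming the four-argument constructor used elsewhere on this page.
// Argument labels are illustrative, not copied from the Druid source.
SingleDimensionPartitionsSpec rangeSpec = new SingleDimensionPartitionsSpec(
    5000,   // target rows per segment
    null,   // max rows per segment (null lets the spec derive a default)
    "dim",  // dimension to range-partition on
    false   // assumeGrouped: input is not assumed to be pre-grouped
);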
Use of org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
The class HadoopDruidDetermineConfigurationJobTest, method testRunWithSingleDimensionPartitionsSpecCreateHashBasedNumberedShardSpecWithoutHashPartitionFunction.
@Test
public void testRunWithSingleDimensionPartitionsSpecCreateHashBasedNumberedShardSpecWithoutHashPartitionFunction()
{
  final Set<Interval> intervals = ImmutableSet.of(
      Intervals.of("2020-01-01/P1D"),
      Intervals.of("2020-01-02/P1D"),
      Intervals.of("2020-01-03/P1D")
  );
  final SingleDimensionPartitionsSpec partitionsSpec = new SingleDimensionPartitionsSpec(1000, null, "dim", false);
  final HadoopDruidIndexerConfig config = Mockito.mock(HadoopDruidIndexerConfig.class);
  Mockito.when(config.isDeterminingPartitions()).thenReturn(false);
  Mockito.when(config.getPartitionsSpec()).thenReturn(partitionsSpec);
  Mockito.when(config.getSegmentGranularIntervals()).thenReturn(intervals);
  final ArgumentCaptor<Map<Long, List<HadoopyShardSpec>>> resultCaptor = ArgumentCaptor.forClass(Map.class);
  Mockito.doNothing().when(config).setShardSpecs(resultCaptor.capture());

  final HadoopDruidDetermineConfigurationJob job = new HadoopDruidDetermineConfigurationJob(config);
  Assert.assertTrue(job.run());

  final Map<Long, List<HadoopyShardSpec>> shardSpecs = resultCaptor.getValue();
  Assert.assertEquals(3, shardSpecs.size());
  for (Interval interval : intervals) {
    final List<HadoopyShardSpec> shardSpecsPerInterval = shardSpecs.get(interval.getStartMillis());
    Assert.assertEquals(1, shardSpecsPerInterval.size());
    Assert.assertEquals(
        new HashBasedNumberedShardSpec(
            0,
            shardSpecsPerInterval.size(),
            0,
            shardSpecsPerInterval.size(),
            ImmutableList.of("dim"),
            null,
            new ObjectMapper()
        ),
        shardSpecsPerInterval.get(0).getActualSpec()
    );
  }
}
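For readability, the expected value in the assertion above can be written with each constructor argument labeled. The labels are my interpretation of that call, not documentation from the Druid source; the point of the test is that the hash partition function argument stays null.

// Sketch only: the same expected spec as in the assertion above, with arguments labeled (my reading of the call).
HashBasedNumberedShardSpec expected = new HashBasedNumberedShardSpec(
    0,                            // partition id within the interval
    shardSpecsPerInterval.size(), // total partitions (1 in this test)
    0,                            // bucket id
    shardSpecsPerInterval.size(), // number of buckets (1 in this test)
    ImmutableList.of("dim"),      // partition dimensions carried over from the SingleDimensionPartitionsSpec
    null,                         // hash partition function left unset, as the test name indicates
    new ObjectMapper()
);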
Use of org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
The class DeterminePartitionsJob, method run.
@Override
public boolean run()
{
  try {
    if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
      throw new ISE(
          "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
          config.getPartitionsSpec()
      );
    }
    final SingleDimensionPartitionsSpec partitionsSpec = (SingleDimensionPartitionsSpec) config.getPartitionsSpec();

    if (!partitionsSpec.isAssumeGrouped()) {
      groupByJob = Job.getInstance(
          new Configuration(),
          StringUtils.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
      );

      JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
      config.addJobProperties(groupByJob);

      groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
      groupByJob.setMapOutputKeyClass(BytesWritable.class);
      groupByJob.setMapOutputValueClass(NullWritable.class);
      groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setOutputKeyClass(BytesWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          groupByJob
      );

      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());

      groupByJob.submit();
      log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());

      // Store the jobId in the file
      if (groupByJob.getJobID() != null) {
        JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
      }

      try {
        if (!groupByJob.waitForCompletion(true)) {
          log.error("Job failed: %s", groupByJob.getJobID());
          failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
          return false;
        }
      }
      catch (IOException ioe) {
        if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
          throw ioe;
        }
      }
    } else {
      log.info("Skipping group-by job.");
    }

    /*
     * Read grouped data and determine appropriate partitions.
     */
    final Job dimSelectionJob = Job.getInstance(
        new Configuration(),
        StringUtils.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals())
    );

    dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");

    JobHelper.injectSystemProperties(dimSelectionJob.getConfiguration(), config);
    config.addJobProperties(dimSelectionJob);

    if (!partitionsSpec.isAssumeGrouped()) {
      // Read grouped data from the groupByJob.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
      dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
      FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
    } else {
      // Directly read the source data, since we assume it's already grouped.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
      config.addInputPaths(dimSelectionJob);
    }

    SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob, DeterminePartitionsDimSelectionPartitioner.class);
    dimSelectionJob.setMapOutputValueClass(Text.class);
    dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
    dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
    dimSelectionJob.setOutputKeyClass(BytesWritable.class);
    dimSelectionJob.setOutputValueClass(Text.class);
    dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
    dimSelectionJob.setNumReduceTasks(Iterators.size(config.getGranularitySpec().sortedBucketIntervals().iterator()));
    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(config.getWorkingPath()),
        JobHelper.distributedClassPath(config.makeIntermediatePath()),
        dimSelectionJob
    );

    config.intoConfiguration(dimSelectionJob);
    FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());

    dimSelectionJob.submit();
    log.info(
        "Job %s submitted, status available at: %s",
        dimSelectionJob.getJobName(),
        dimSelectionJob.getTrackingURL()
    );

    // Store the jobId in the file
    if (dimSelectionJob.getJobID() != null) {
      JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), dimSelectionJob.getJobID().toString());
    }

    try {
      if (!dimSelectionJob.waitForCompletion(true)) {
        log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
        failureCause = Utils.getFailureMessage(dimSelectionJob, HadoopDruidIndexerConfig.JSON_MAPPER);
        return false;
      }
    }
    catch (IOException ioe) {
      if (!Utils.checkAppSuccessForJobIOException(ioe, dimSelectionJob, config.isUseYarnRMJobStatusFallback())) {
        throw ioe;
      }
    }

    /*
     * Load partitions determined by the previous job.
     */
    log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
    FileSystem fileSystem = null;
    Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>();
    int shardCount = 0;
    for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
      final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
      if (fileSystem == null) {
        fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
      }
      if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
        List<ShardSpec> specs = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(
            Utils.openInputStream(dimSelectionJob, partitionInfoPath),
            new TypeReference<List<ShardSpec>>() {}
        );

        List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
        for (int i = 0; i < specs.size(); ++i) {
          actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
          log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
        }

        shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
      } else {
        log.info("Path[%s] didn't exist!?", partitionInfoPath);
      }
    }
    config.setShardSpecs(shardSpecs);

    return true;
  }
  catch (Exception e) {
    throw new RuntimeException(e);
  }
}
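The branching in run() hinges on isAssumeGrouped(): when it is false, an extra group-by MapReduce job deduplicates the rows before the dim-selection job; when it is true, the dim-selection job reads the source data directly. A rough sketch of how that flag is carried by the spec (the constructions here are illustrative, not lifted from DeterminePartitionsJob):

// Illustrative sketch: assumeGrouped is the last constructor argument.
SingleDimensionPartitionsSpec notGrouped = new SingleDimensionPartitionsSpec(1000, null, "dim", false);
SingleDimensionPartitionsSpec grouped = new SingleDimensionPartitionsSpec(1000, null, "dim", true);

// In run() above, the first spec would trigger the extra group-by job,
// while the second would skip straight to the dim-selection job.
assert !notGrouped.isAssumeGrouped();
assert grouped.isAssumeGrouped();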
Use of org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
The class HadoopIngestionSpecTest, method testPartitionsSpecMaxPartitionSize.
@Test
public void testPartitionsSpecMaxPartitionSize()
{
  final HadoopIngestionSpec schema;
  try {
    schema = jsonReadWriteRead(
        "{\n"
        + "  \"tuningConfig\": {\n"
        + "    \"type\": \"hadoop\",\n"
        + "    \"partitionsSpec\": {\n"
        + "      \"type\": \"dimension\",\n"
        + "      \"targetPartitionSize\": 100,\n"
        + "      \"maxPartitionSize\" : null,\n"
        + "      \"partitionDimension\" : \"foo\"\n"
        + "    }\n"
        + "  }\n"
        + "}",
        HadoopIngestionSpec.class
    );
  }
  catch (Exception e) {
    throw new RuntimeException(e);
  }
  PartitionsSpec partitionsSpec = schema.getTuningConfig().getPartitionsSpec();
  Assert.assertTrue("partitionsSpec", partitionsSpec instanceof SingleDimensionPartitionsSpec);
  SingleDimensionPartitionsSpec singleDimensionPartitionsSpec = (SingleDimensionPartitionsSpec) partitionsSpec;
  Assert.assertTrue("isDeterminingPartitions", singleDimensionPartitionsSpec.needsDeterminePartitions(true));
  Assert.assertEquals("getTargetPartitionSize", 100, singleDimensionPartitionsSpec.getTargetRowsPerSegment().intValue());
  Assert.assertEquals("getMaxPartitionSize", 150, singleDimensionPartitionsSpec.getMaxRowsPerSegment().intValue());
  Assert.assertEquals("getPartitionDimension", "foo", singleDimensionPartitionsSpec.getPartitionDimension());
}
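The notable assertion here is getMaxPartitionSize: the JSON sets maxPartitionSize to null, yet the resolved getMaxRowsPerSegment() is 150, i.e. 150% of the target of 100. A small hedged sketch of the same behavior built directly in Java; the 1.5x default is inferred from this test's assertion rather than from the spec's documentation.

// Sketch: with no explicit max, the resolved max appears to be 150% of the target,
// matching the 100 -> 150 assertion in the test above.
SingleDimensionPartitionsSpec spec = new SingleDimensionPartitionsSpec(100, null, "foo", false);
System.out.println(spec.getTargetRowsPerSegment()); // 100
System.out.println(spec.getMaxRowsPerSegment());    // expected 150, per the assertion above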
Use of org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
The class CompactionTaskParallelRunTest, method testCompactRangeAndDynamicPartitionedSegments.
@Test
public void testCompactRangeAndDynamicPartitionedSegments()
{
  runIndexTask(new SingleDimensionPartitionsSpec(2, null, "dim", false), false);
  runIndexTask(null, true);

  final Builder builder = new Builder(DATA_SOURCE, getSegmentCacheManagerFactory(), RETRY_POLICY_FACTORY);
  final CompactionTask compactionTask = builder
      .inputSpec(new CompactionIntervalSpec(INTERVAL_TO_INDEX, null))
      .tuningConfig(AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING)
      .build();

  final Map<Interval, List<DataSegment>> intervalToSegments = SegmentUtils.groupSegmentsByInterval(runTask(compactionTask));
  Assert.assertEquals(3, intervalToSegments.size());
  Assert.assertEquals(
      ImmutableSet.of(
          Intervals.of("2014-01-01T00/PT1H"),
          Intervals.of("2014-01-01T01/PT1H"),
          Intervals.of("2014-01-01T02/PT1H")
      ),
      intervalToSegments.keySet()
  );

  for (Entry<Interval, List<DataSegment>> entry : intervalToSegments.entrySet()) {
    final List<DataSegment> segmentsInInterval = entry.getValue();
    Assert.assertEquals(1, segmentsInInterval.size());
    final ShardSpec shardSpec = segmentsInInterval.get(0).getShardSpec();
    if (lockGranularity == LockGranularity.TIME_CHUNK) {
      Assert.assertSame(NumberedShardSpec.class, shardSpec.getClass());
      final NumberedShardSpec numberedShardSpec = (NumberedShardSpec) shardSpec;
      Assert.assertEquals(0, numberedShardSpec.getPartitionNum());
      Assert.assertEquals(1, numberedShardSpec.getNumCorePartitions());
    } else {
      Assert.assertSame(NumberedOverwriteShardSpec.class, shardSpec.getClass());
      final NumberedOverwriteShardSpec numberedShardSpec = (NumberedOverwriteShardSpec) shardSpec;
      Assert.assertEquals(PartitionIds.NON_ROOT_GEN_START_PARTITION_ID, numberedShardSpec.getPartitionNum());
      Assert.assertEquals(1, numberedShardSpec.getAtomicUpdateGroupSize());
    }
  }
}
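The else branch above is what segment locking produces: overwrite shard specs whose partition ids start at PartitionIds.NON_ROOT_GEN_START_PARTITION_ID. A small helper sketch (my own, not part of the Druid test) that mirrors that branching can make the intent explicit:

// Sketch only: classifies a shard spec the way the assertions above do.
// Time-chunk locking yields plain NumberedShardSpecs; segment locking yields
// NumberedOverwriteShardSpecs in the non-root-generation partition id range.
private static boolean isOverwriteShardSpec(ShardSpec shardSpec)
{
  return shardSpec instanceof NumberedOverwriteShardSpec
         && ((NumberedOverwriteShardSpec) shardSpec).getPartitionNum() >= PartitionIds.NON_ROOT_GEN_START_PARTITION_ID;
}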