use of org.apache.druid.indexer.partitions.PartitionsSpec in project druid by druid-io.
the class PartialRangeSegmentGenerateTask method getPartitionDimensions.
/**
 * Reads the partition dimensions out of the partitions spec carried by the
 * tuning config of {@code ingestionSpec}.
 *
 * @param ingestionSpec ingestion spec whose tuning config supplies the partitions spec
 * @return the partition dimensions reported by the partitions spec; never {@code null}
 * @throws IllegalArgumentException if the partitions spec is not a {@link DimensionRangePartitionsSpec}
 * @throws NullPointerException if the partitions spec reports {@code null} partition dimensions
 */
private static List<String> getPartitionDimensions(ParallelIndexIngestionSpec ingestionSpec) {
  final PartitionsSpec spec = ingestionSpec.getTuningConfig().getPartitionsSpec();

  // Only range-style specs are accepted; the message names both spec types for the user.
  final boolean isRangeSpec = spec instanceof DimensionRangePartitionsSpec;
  Preconditions.checkArgument(
      isRangeSpec,
      "%s or %s partitionsSpec required",
      DimensionRangePartitionsSpec.NAME,
      SingleDimensionPartitionsSpec.NAME
  );

  // checkNotNull hands back its argument, so the validated list is returned directly.
  return Preconditions.checkNotNull(
      ((DimensionRangePartitionsSpec) spec).getPartitionDimensions(),
      "partitionDimension required"
  );
}
use of org.apache.druid.indexer.partitions.PartitionsSpec in project druid by druid-io.
the class ITPerfectRollupParallelIndexTest method testIndexData.
/**
 * Runs one index task and two reindex tasks with the supplied partitions spec
 * substituted into the task templates, after asserting that the spec reports
 * itself as compatible with forced guaranteed rollup.
 *
 * @param partitionsSpec partitions spec supplied by the "resources" data provider
 * @throws Exception if any of the index/reindex steps or the unloaders fail
 */
@Test(dataProvider = "resources")
public void testIndexData(PartitionsSpec partitionsSpec) throws Exception {
  // The three unloaders are closed when the block exits, whether or not the test passed.
  try (
      final Closeable indexUnloader = unloader(INDEX_DATASOURCE + config.getExtraDatasourceNameSuffix());
      final Closeable ingestSegmentUnloader = unloader(INDEX_INGEST_SEGMENT_DATASOURCE + config.getExtraDatasourceNameSuffix());
      final Closeable druidInputSourceUnloader = unloader(INDEX_DRUID_INPUT_SOURCE_DATASOURCE + config.getExtraDatasourceNameSuffix())
  ) {
    Assert.assertTrue(
        partitionsSpec.isForceGuaranteedRollupCompatible(),
        "parititionSpec does not support perfect rollup"
    );

    // Fills the three placeholders of a task template, one after another.
    final Function<String, String> rollupTransform = template -> {
      try {
        final String withRollup = StringUtils.replace(
            template,
            "%%FORCE_GUARANTEED_ROLLUP%%",
            Boolean.toString(true)
        );
        final String withTimeout = StringUtils.replace(
            withRollup,
            "%%SEGMENT_AVAIL_TIMEOUT_MILLIS%%",
            jsonMapper.writeValueAsString("0")
        );
        return StringUtils.replace(
            withTimeout,
            "%%PARTITIONS_SPEC%%",
            jsonMapper.writeValueAsString(partitionsSpec)
        );
      }
      catch (JsonProcessingException e) {
        throw new RuntimeException(e);
      }
    };

    doIndexTest(
        INDEX_DATASOURCE,
        INDEX_TASK,
        rollupTransform,
        INDEX_QUERIES_RESOURCE,
        false,
        true,
        true,
        new Pair<>(false, false)
    );

    doReindexTest(
        INDEX_DATASOURCE,
        INDEX_INGEST_SEGMENT_DATASOURCE,
        rollupTransform,
        INDEX_INGEST_SEGMENT_TASK,
        INDEX_QUERIES_RESOURCE,
        new Pair<>(false, false)
    );

    // Same reindex again, this time through DruidInputSource rather than IngestSegmentFirehose.
    doReindexTest(
        INDEX_DATASOURCE,
        INDEX_DRUID_INPUT_SOURCE_DATASOURCE,
        rollupTransform,
        INDEX_DRUID_INPUT_SOURCE_TASK,
        INDEX_QUERIES_RESOURCE,
        new Pair<>(false, false)
    );
  }
}
use of org.apache.druid.indexer.partitions.PartitionsSpec in project druid by druid-io.
the class ParallelIndexSupervisorTaskSerdeTest method forceGuaranteedRollupWithHashPartitionsMissingNumShards.
/**
 * Builds a supervisor task with forced guaranteed rollup and a hashed
 * partitions spec whose numShards is null, then checks that the task's
 * tuning config still exposes a {@link HashedPartitionsSpec}.
 */
@Test
public void forceGuaranteedRollupWithHashPartitionsMissingNumShards() {
  // Kept as a typed null so the same HashedPartitionsSpec constructor overload is selected.
  final Integer missingNumShards = null;
  final HashedPartitionsSpec hashedSpec = new HashedPartitionsSpec(null, missingNumShards, null);

  final ParallelIndexSupervisorTask supervisorTask = new ParallelIndexSupervisorTaskBuilder()
      .ingestionSpec(
          new ParallelIndexIngestionSpecBuilder()
              .forceGuaranteedRollup(true)
              .partitionsSpec(hashedSpec)
              .inputIntervals(INTERVALS)
              .build()
      )
      .build();

  final PartitionsSpec actualSpec = supervisorTask.getIngestionSchema().getTuningConfig().getPartitionsSpec();
  Assert.assertThat(actualSpec, CoreMatchers.instanceOf(HashedPartitionsSpec.class));
}
use of org.apache.druid.indexer.partitions.PartitionsSpec in project druid by druid-io.
the class ParallelIndexSupervisorTaskSerdeTest method forceGuaranteedRollupWithSingleDimPartitionsValid.
/**
 * Builds a supervisor task with forced guaranteed rollup and a single-dimension
 * partitions spec, then checks that the task's tuning config exposes a
 * {@link SingleDimensionPartitionsSpec}.
 */
@Test
public void forceGuaranteedRollupWithSingleDimPartitionsValid() {
  final SingleDimensionPartitionsSpec singleDimSpec = new SingleDimensionPartitionsSpec(1, null, "a", true);

  final ParallelIndexSupervisorTask supervisorTask = new ParallelIndexSupervisorTaskBuilder()
      .ingestionSpec(
          new ParallelIndexIngestionSpecBuilder()
              .forceGuaranteedRollup(true)
              .partitionsSpec(singleDimSpec)
              .inputIntervals(INTERVALS)
              .build()
      )
      .build();

  final PartitionsSpec actualSpec = supervisorTask.getIngestionSchema().getTuningConfig().getPartitionsSpec();
  Assert.assertThat(actualSpec, CoreMatchers.instanceOf(SingleDimensionPartitionsSpec.class));
}
use of org.apache.druid.indexer.partitions.PartitionsSpec in project druid by druid-io.
the class DetermineHashedPartitionsJob method run.
/**
 * Submits a Hadoop job that writes per-interval row-count files, then reads
 * those files back and derives hash-based shard specs for every segment
 * granular interval, storing the result on {@code config}.
 *
 * <p>When {@code config} has no input intervals, the intervals are read from
 * the file the job wrote at {@code config.makeIntervalInfoPath()} and the
 * granularity spec on {@code config} is replaced accordingly.
 *
 * @return {@code true} once the shard specs have been set on {@code config};
 *         {@code false} if {@code waitForCompletion} reports that the job failed
 *         (in which case {@code failureCause} is populated)
 * @throws RuntimeException wrapping any exception raised while configuring,
 *         running, or reading the output of the job, including the ISE thrown
 *         when the interval info file is missing or the partitions spec is not
 *         a {@link HashedPartitionsSpec}
 */
@Override
public boolean run() {
try {
/*
* Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
* in the final segment.
*/
startTime = System.currentTimeMillis();
groupByJob = Job.getInstance(new Configuration(), StringUtils.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
JobHelper.injectSystemProperties(groupByJob.getConfiguration(), config);
config.addJobProperties(groupByJob);
groupByJob.setMapperClass(DetermineCardinalityMapper.class);
groupByJob.setMapOutputKeyClass(LongWritable.class);
groupByJob.setMapOutputValueClass(BytesWritable.class);
groupByJob.setReducerClass(DetermineCardinalityReducer.class);
groupByJob.setOutputKeyClass(NullWritable.class);
groupByJob.setOutputValueClass(NullWritable.class);
groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
// With no input intervals configured a single reducer is used; otherwise one reducer
// per segment granular interval.
if (config.getInputIntervals().isEmpty()) {
groupByJob.setNumReduceTasks(1);
} else {
groupByJob.setNumReduceTasks(Iterators.size(config.getSegmentGranularIntervals().iterator()));
}
JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
config.addInputPaths(groupByJob);
config.intoConfiguration(groupByJob);
FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();
log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
// Store the jobId in the file
if (groupByJob.getJobID() != null) {
JobHelper.writeJobIdToFile(config.getHadoopJobIdFileName(), groupByJob.getJobID().toString());
}
try {
if (!groupByJob.waitForCompletion(true)) {
log.error("Job failed: %s", groupByJob.getJobID());
failureCause = Utils.getFailureMessage(groupByJob, HadoopDruidIndexerConfig.JSON_MAPPER);
return false;
}
} catch (IOException ioe) {
// An IOException while waiting is only fatal when the fallback check does not
// report the application as successful; otherwise execution continues below.
if (!Utils.checkAppSuccessForJobIOException(ioe, groupByJob, config.isUseYarnRMJobStatusFallback())) {
throw ioe;
}
}
/*
* Load partitions and intervals determined by the previous job.
*/
log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
// Resolved lazily: either here from the interval info path, or inside the loop below
// from the first partition info path.
FileSystem fileSystem = null;
if (config.getInputIntervals().isEmpty()) {
final Path intervalInfoPath = config.makeIntervalInfoPath();
fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
}
List<Interval> intervals = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
});
// Rebuild the granularity spec with the same granularities and rollup flag, but
// with the intervals read from the job output.
config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
}
// Keyed by the start millis of each segment granular interval (see the put() below).
Map<Long, List<HadoopyShardSpec>> shardSpecs = new TreeMap<>(DateTimeComparator.getInstance());
PartitionsSpec partitionsSpec = config.getPartitionsSpec();
if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
throw new ISE("%s is expected, but got %s", HashedPartitionsSpec.class.getName(), partitionsSpec.getClass().getName());
}
HashPartitionFunction partitionFunction = ((HashedPartitionsSpec) partitionsSpec).getPartitionFunction();
// Running counter across all intervals; passed as the second HadoopyShardSpec argument.
int shardCount = 0;
for (Interval segmentGranularity : config.getSegmentGranularIntervals()) {
DateTime bucket = segmentGranularity.getStart();
final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
}
if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
final Long numRows = HadoopDruidIndexerConfig.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), Long.class);
log.info("Found approximately [%,d] rows in data.", numRows);
// Round up so that a trailing partial partition still gets its own shard.
// NOTE(review): a row count of 0 would yield zero shards and an empty list for
// this bucket - confirm that is intended.
final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
log.info("Creating [%,d] shards", numberOfShards);
List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
for (int i = 0; i < numberOfShards; ++i) {
actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, i, numberOfShards, null, partitionFunction, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
}
shardSpecs.put(bucket.getMillis(), actualSpecs);
} else {
// A missing partition info file is only logged; the interval gets no entry in shardSpecs.
log.info("Path[%s] didn't exist!?", partitionInfoPath);
}
}
config.setShardSpecs(shardSpecs);
log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
return true;
} catch (Exception e) {
throw new RuntimeException(e);
}
}
Aggregations