Use of io.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
From the class DeterminePartitionsJob, the run() method.
public boolean run() {
  try {
    if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
      throw new ISE(
          "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
          config.getPartitionsSpec()
      );
    }

    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      // First pass: group the raw input so the dimension-selection job sees each distinct row once.
      final Job groupByJob = Job.getInstance(
          new Configuration(),
          String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
      );
      JobHelper.injectSystemProperties(groupByJob);
      config.addJobProperties(groupByJob);
      groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
      groupByJob.setMapOutputKeyClass(BytesWritable.class);
      groupByJob.setMapOutputValueClass(NullWritable.class);
      groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
      groupByJob.setOutputKeyClass(BytesWritable.class);
      groupByJob.setOutputValueClass(NullWritable.class);
      groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
      JobHelper.setupClasspath(
          JobHelper.distributedClassPath(config.getWorkingPath()),
          JobHelper.distributedClassPath(config.makeIntermediatePath()),
          groupByJob
      );
      config.addInputPaths(groupByJob);
      config.intoConfiguration(groupByJob);
      FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
      groupByJob.submit();
      log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
      if (!groupByJob.waitForCompletion(true)) {
        log.error("Job failed: %s", groupByJob.getJobID());
        return false;
      }
    } else {
      log.info("Skipping group-by job.");
    }

    /*
     * Read grouped data and determine appropriate partitions.
     */
    final Job dimSelectionJob = Job.getInstance(
        new Configuration(),
        String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals())
    );
    dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
    JobHelper.injectSystemProperties(dimSelectionJob);
    config.addJobProperties(dimSelectionJob);
    if (!config.getPartitionsSpec().isAssumeGrouped()) {
      // Read grouped data from the groupByJob.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
      dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
      FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
    } else {
      // Directly read the source data, since we assume it's already grouped.
      dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
      config.addInputPaths(dimSelectionJob);
    }
    SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
    dimSelectionJob.setMapOutputValueClass(Text.class);
    dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
    dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
    dimSelectionJob.setOutputKeyClass(BytesWritable.class);
    dimSelectionJob.setOutputValueClass(Text.class);
    dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
    dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
    // One reducer per segment granularity bucket.
    dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
    JobHelper.setupClasspath(
        JobHelper.distributedClassPath(config.getWorkingPath()),
        JobHelper.distributedClassPath(config.makeIntermediatePath()),
        dimSelectionJob
    );
    config.intoConfiguration(dimSelectionJob);
    FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
    dimSelectionJob.submit();
    log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
    if (!dimSelectionJob.waitForCompletion(true)) {
      log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
      return false;
    }

    /*
     * Load partitions determined by the previous job.
     */
    log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
    FileSystem fileSystem = null;
    Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
    int shardCount = 0;
    for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
      final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
      if (fileSystem == null) {
        fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
      }
      if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
        List<ShardSpec> specs = config.JSON_MAPPER.readValue(
            Utils.openInputStream(dimSelectionJob, partitionInfoPath),
            new TypeReference<List<ShardSpec>>() {}
        );
        List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
        for (int i = 0; i < specs.size(); ++i) {
          // Assign a globally increasing shard number across all intervals.
          actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
          log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
        }
        shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
      } else {
        log.info("Path[%s] didn't exist!?", partitionInfoPath);
      }
    }
    config.setShardSpecs(shardSpecs);
    return true;
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
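In short, run() refuses any partitions spec other than SingleDimensionPartitionsSpec and then uses isAssumeGrouped() to decide whether the group-by pre-pass has to run before dimension selection. The fragment below restates only that guard as a minimal, hypothetical sketch; needsGroupByPass is not a real DeterminePartitionsJob method, and only the accessors already visible in the snippet above are assumed.

  // Hypothetical sketch (not Druid code): the up-front decisions run() makes.
  static boolean needsGroupByPass(PartitionsSpec partitionsSpec)
  {
    if (!(partitionsSpec instanceof SingleDimensionPartitionsSpec)) {
      // Same failure mode as run(): no other spec type is supported by this job.
      throw new ISE(
          "DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]",
          partitionsSpec
      );
    }
    // assumeGrouped == false: run the group-by MapReduce pass first so the
    // dimension-selection job counts each distinct row once.
    // assumeGrouped == true: skip it and read the source data directly.
    return !partitionsSpec.isAssumeGrouped();
  }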
Use of io.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.
From the class HadoopIngestionSpecTest, the testPartitionsSpecMaxPartitionSize() method.
@Test
public void testPartitionsSpecMaxPartitionSize() {
  final HadoopIngestionSpec schema;
  try {
    schema = jsonReadWriteRead(
        "{\n"
        + "  \"tuningConfig\": {\n"
        + "    \"type\": \"hadoop\",\n"
        + "    \"partitionsSpec\": {\n"
        + "      \"type\": \"dimension\",\n"
        + "      \"targetPartitionSize\": 100,\n"
        + "      \"maxPartitionSize\": 200,\n"
        + "      \"partitionDimension\": \"foo\"\n"
        + "    }\n"
        + "  }\n"
        + "}",
        HadoopIngestionSpec.class
    );
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
  final PartitionsSpec partitionsSpec = schema.getTuningConfig().getPartitionsSpec();
  Assert.assertEquals("isDeterminingPartitions", partitionsSpec.isDeterminingPartitions(), true);
  Assert.assertEquals("getTargetPartitionSize", partitionsSpec.getTargetPartitionSize(), 100);
  Assert.assertEquals("getMaxPartitionSize", partitionsSpec.getMaxPartitionSize(), 200);
  Assert.assertTrue("partitionsSpec", partitionsSpec instanceof SingleDimensionPartitionsSpec);
  Assert.assertEquals("getPartitionDimension", ((SingleDimensionPartitionsSpec) partitionsSpec).getPartitionDimension(), "foo");
}
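The test drives deserialization through jsonReadWriteRead and the full HadoopIngestionSpec, but the point it establishes is that "type": "dimension" resolves to SingleDimensionPartitionsSpec. Below is a minimal, hedged sketch of that round trip with a bare Jackson ObjectMapper; it assumes PartitionsSpec carries the polymorphic type annotations implied by the test, which this snippet does not show directly.

  // Minimal sketch, assuming PartitionsSpec's Jackson annotations map "dimension"
  // to SingleDimensionPartitionsSpec (implied, not shown, by the test above).
  ObjectMapper mapper = new ObjectMapper();
  PartitionsSpec spec = mapper.readValue(
      "{\"type\": \"dimension\", \"targetPartitionSize\": 100,"
      + " \"maxPartitionSize\": 200, \"partitionDimension\": \"foo\"}",
      PartitionsSpec.class
  );
  // Per the assertions above, one would expect:
  //   spec instanceof SingleDimensionPartitionsSpec                  -> true
  //   spec.getTargetPartitionSize()                                  -> 100
  //   spec.getMaxPartitionSize()                                     -> 200
  //   ((SingleDimensionPartitionsSpec) spec).getPartitionDimension() -> "foo"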