
Example 1 with SingleDimensionPartitionsSpec

Use of io.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.

The class DeterminePartitionsJob, method run:

public boolean run() {
    try {
        if (!(config.getPartitionsSpec() instanceof SingleDimensionPartitionsSpec)) {
            throw new ISE("DeterminePartitionsJob can only be run for SingleDimensionPartitionsSpec, partitionSpec found [%s]", config.getPartitionsSpec());
        }
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            final Job groupByJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals()));
            JobHelper.injectSystemProperties(groupByJob);
            config.addJobProperties(groupByJob);
            groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
            groupByJob.setMapOutputKeyClass(BytesWritable.class);
            groupByJob.setMapOutputValueClass(NullWritable.class);
            groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setReducerClass(DeterminePartitionsGroupByReducer.class);
            groupByJob.setOutputKeyClass(BytesWritable.class);
            groupByJob.setOutputValueClass(NullWritable.class);
            groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
            JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
            config.addInputPaths(groupByJob);
            config.intoConfiguration(groupByJob);
            FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
            groupByJob.submit();
            log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
            if (!groupByJob.waitForCompletion(true)) {
                log.error("Job failed: %s", groupByJob.getJobID());
                return false;
            }
        } else {
            log.info("Skipping group-by job.");
        }
        /*
         * Read grouped data and determine appropriate partitions.
         */
        final Job dimSelectionJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_dimselection-%s", config.getDataSource(), config.getIntervals()));
        dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
        JobHelper.injectSystemProperties(dimSelectionJob);
        config.addJobProperties(dimSelectionJob);
        if (!config.getPartitionsSpec().isAssumeGrouped()) {
            // Read grouped data from the groupByJob.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
            dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
            FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
        } else {
            // Directly read the source data, since we assume it's already grouped.
            dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionAssumeGroupedMapper.class);
            config.addInputPaths(dimSelectionJob);
        }
        SortableBytes.useSortableBytesAsMapOutputKey(dimSelectionJob);
        dimSelectionJob.setMapOutputValueClass(Text.class);
        dimSelectionJob.setCombinerClass(DeterminePartitionsDimSelectionCombiner.class);
        dimSelectionJob.setReducerClass(DeterminePartitionsDimSelectionReducer.class);
        dimSelectionJob.setOutputKeyClass(BytesWritable.class);
        dimSelectionJob.setOutputValueClass(Text.class);
        dimSelectionJob.setOutputFormatClass(DeterminePartitionsDimSelectionOutputFormat.class);
        dimSelectionJob.setPartitionerClass(DeterminePartitionsDimSelectionPartitioner.class);
        dimSelectionJob.setNumReduceTasks(config.getGranularitySpec().bucketIntervals().get().size());
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), dimSelectionJob);
        config.intoConfiguration(dimSelectionJob);
        FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
        dimSelectionJob.submit();
        log.info("Job %s submitted, status available at: %s", dimSelectionJob.getJobName(), dimSelectionJob.getTrackingURL());
        if (!dimSelectionJob.waitForCompletion(true)) {
            log.error("Job failed: %s", dimSelectionJob.getJobID().toString());
            return false;
        }
        /*
         * Load partitions determined by the previous job.
         */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
            }
            if (Utils.exists(dimSelectionJob, fileSystem, partitionInfoPath)) {
                List<ShardSpec> specs = config.JSON_MAPPER.readValue(Utils.openInputStream(dimSelectionJob, partitionInfoPath), new TypeReference<List<ShardSpec>>() {
                });
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(specs.size());
                for (int i = 0; i < specs.size(); ++i) {
                    actualSpecs.add(new HadoopyShardSpec(specs.get(i), shardCount++));
                    log.info("DateTime[%s], partition[%d], spec[%s]", segmentGranularity, i, actualSpecs.get(i));
                }
                shardSpecs.put(segmentGranularity.getStartMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), SingleDimensionPartitionsSpec (io.druid.indexer.partitions.SingleDimensionPartitionsSpec), SingleDimensionShardSpec (io.druid.timeline.partition.SingleDimensionShardSpec), NoneShardSpec (io.druid.timeline.partition.NoneShardSpec), ShardSpec (io.druid.timeline.partition.ShardSpec), InvalidJobConfException (org.apache.hadoop.mapred.InvalidJobConfException), IOException (java.io.IOException), FileSystem (org.apache.hadoop.fs.FileSystem), ISE (io.druid.java.util.common.ISE), List (java.util.List), Job (org.apache.hadoop.mapreduce.Job), Interval (org.joda.time.Interval).
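
The run() method above configures both Hadoop jobs the same way: build a Job, wire up mapper, reducer, and output classes, submit, log the tracking URL, then block on waitForCompletion and return false on failure. Below is a minimal, self-contained sketch of just that submit-and-wait skeleton using only the plain Hadoop MapReduce API; the identity Mapper/Reducer, the class name, and the /tmp paths are placeholders for illustration, not the Druid-specific classes or paths used above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SubmitAndWaitSketch {
    public static boolean runJob() throws Exception {
        final Job job = Job.getInstance(new Configuration(), "example-determine-partitions");
        job.setJarByClass(SubmitAndWaitSketch.class);
        // Identity mapper and reducer stand in for the Druid-specific
        // DeterminePartitions* classes wired up in the real job.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path("/tmp/in"));    // placeholder input
        FileOutputFormat.setOutputPath(job, new Path("/tmp/out")); // placeholder output
        // Same shape as run(): submit, log the tracking URL, then block on
        // completion and report failure to the caller instead of throwing.
        job.submit();
        System.out.printf("Job %s submitted, status available at: %s%n",
                job.getJobName(), job.getTrackingURL());
        return job.waitForCompletion(true);
    }
}

In run() this pattern appears twice: once for the optional group-by job (skipped when isAssumeGrouped() is true) and once for the dim-selection job whose output is then read back as the per-interval shard specs.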

Example 2 with SingleDimensionPartitionsSpec

Use of io.druid.indexer.partitions.SingleDimensionPartitionsSpec in project druid by druid-io.

The class HadoopIngestionSpecTest, method testPartitionsSpecMaxPartitionSize:

@Test
public void testPartitionsSpecMaxPartitionSize() {
    final HadoopIngestionSpec schema;
    try {
        schema = jsonReadWriteRead(
                "{\n"
                + "    \"tuningConfig\": {\n"
                + "        \"type\": \"hadoop\",\n"
                + "        \"partitionsSpec\": {\n"
                + "            \"type\": \"dimension\",\n"
                + "            \"targetPartitionSize\": 100,\n"
                + "            \"maxPartitionSize\" : 200,\n"
                + "            \"partitionDimension\" : \"foo\"\n"
                + "        }\n"
                + "    }\n"
                + "}",
                HadoopIngestionSpec.class
        );
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
    final PartitionsSpec partitionsSpec = schema.getTuningConfig().getPartitionsSpec();
    Assert.assertEquals("isDeterminingPartitions", partitionsSpec.isDeterminingPartitions(), true);
    Assert.assertEquals("getTargetPartitionSize", partitionsSpec.getTargetPartitionSize(), 100);
    Assert.assertEquals("getMaxPartitionSize", partitionsSpec.getMaxPartitionSize(), 200);
    Assert.assertTrue("partitionsSpec", partitionsSpec instanceof SingleDimensionPartitionsSpec);
    Assert.assertEquals("getPartitionDimension", ((SingleDimensionPartitionsSpec) partitionsSpec).getPartitionDimension(), "foo");
}
Also used: HashedPartitionsSpec (io.druid.indexer.partitions.HashedPartitionsSpec), SingleDimensionPartitionsSpec (io.druid.indexer.partitions.SingleDimensionPartitionsSpec), PartitionsSpec (io.druid.indexer.partitions.PartitionsSpec), JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException), Test (org.junit.Test).
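
The test exercises Jackson deserialization of the "dimension" partitionsSpec through the full HadoopIngestionSpec via jsonReadWriteRead. As a rough companion sketch, the fragment below reads just the partitionsSpec JSON with a plain ObjectMapper; it assumes PartitionsSpec declares its "type"-to-subtype mapping via Jackson annotations, which is what lets "type": "dimension" resolve to SingleDimensionPartitionsSpec in the test.

import com.fasterxml.jackson.databind.ObjectMapper;
import io.druid.indexer.partitions.PartitionsSpec;
import io.druid.indexer.partitions.SingleDimensionPartitionsSpec;

public class PartitionsSpecJsonSketch {
    public static void main(String[] args) throws Exception {
        // The same partitionsSpec fragment used in the test, read on its own.
        final String json = "{\"type\": \"dimension\", \"targetPartitionSize\": 100, "
                + "\"maxPartitionSize\": 200, \"partitionDimension\": \"foo\"}";
        // Assumption: PartitionsSpec carries Jackson polymorphic-type annotations,
        // so a plain ObjectMapper suffices here; the test goes through
        // jsonReadWriteRead and the tuningConfig wrapper instead.
        final ObjectMapper mapper = new ObjectMapper();
        final PartitionsSpec spec = mapper.readValue(json, PartitionsSpec.class);
        System.out.println(spec instanceof SingleDimensionPartitionsSpec);                   // true
        System.out.println(spec.getTargetPartitionSize());                                   // 100
        System.out.println(((SingleDimensionPartitionsSpec) spec).getPartitionDimension());  // foo
    }
}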
