
Example 1 with HashBasedNumberedShardSpec

Use of io.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

From the class OrcIndexGeneratorJobTest, method verifyJob:

private void verifyJob(IndexGeneratorJob job) throws IOException {
    JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);
    int segmentNum = 0;
    for (DateTime currTime = interval.getStart(); currTime.isBefore(interval.getEnd()); currTime = currTime.plusDays(1)) {
        Integer[][] shardInfo = shardInfoForEachSegment[segmentNum++];
        File segmentOutputFolder = new File(
            String.format(
                "%s/%s/%s_%s/%s",
                config.getSchema().getIOConfig().getSegmentOutputPath(),
                config.getSchema().getDataSchema().getDataSource(),
                currTime.toString(),
                currTime.plusDays(1).toString(),
                config.getSchema().getTuningConfig().getVersion()
            )
        );
        Assert.assertTrue(segmentOutputFolder.exists());
        Assert.assertEquals(shardInfo.length, segmentOutputFolder.list().length);
        int rowCount = 0;
        for (int partitionNum = 0; partitionNum < shardInfo.length; ++partitionNum) {
            File individualSegmentFolder = new File(segmentOutputFolder, Integer.toString(partitionNum));
            Assert.assertTrue(individualSegmentFolder.exists());
            File descriptor = new File(individualSegmentFolder, "descriptor.json");
            File indexZip = new File(individualSegmentFolder, "index.zip");
            Assert.assertTrue(descriptor.exists());
            Assert.assertTrue(indexZip.exists());
            DataSegment dataSegment = mapper.readValue(descriptor, DataSegment.class);
            Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
            Assert.assertEquals(new Interval(currTime, currTime.plusDays(1)), dataSegment.getInterval());
            Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
            Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
            Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());
            Assert.assertEquals(dataSourceName, dataSegment.getDataSource());
            Assert.assertEquals(1, dataSegment.getDimensions().size());
            String[] dimensions = dataSegment.getDimensions().toArray(new String[dataSegment.getDimensions().size()]);
            Arrays.sort(dimensions);
            Assert.assertEquals("host", dimensions[0]);
            Assert.assertEquals("visited_num", dataSegment.getMetrics().get(0));
            Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
            Integer[] hashShardInfo = shardInfo[partitionNum];
            HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
            Assert.assertEquals((int) hashShardInfo[0], spec.getPartitionNum());
            Assert.assertEquals((int) hashShardInfo[1], spec.getPartitions());
            File dir = Files.createTempDir();
            unzip(indexZip, dir);
            QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
            QueryableIndexIndexableAdapter adapter = new QueryableIndexIndexableAdapter(index);
            for (Rowboat row : adapter.getRows()) {
                Object[] metrics = row.getMetrics();
                rowCount++;
                Assert.assertEquals(2, metrics.length);
            }
        }
        Assert.assertEquals(data.size(), rowCount);
    }
}
Also used:
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)
DataSegment (io.druid.timeline.DataSegment)
DateTime (org.joda.time.DateTime)
QueryableIndexIndexableAdapter (io.druid.segment.QueryableIndexIndexableAdapter)
QueryableIndex (io.druid.segment.QueryableIndex)
OrcFile (org.apache.orc.OrcFile)
File (java.io.File)
Rowboat (io.druid.segment.Rowboat)
Interval (org.joda.time.Interval)
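
The test calls an unzip helper that is not shown in the snippet. Below is a minimal stand-in, assuming nothing beyond java.util.zip (the real test defines its own version; the zip-slip guard is an extra safety check, not taken from the source). It needs java.io.* plus java.util.zip.ZipEntry and java.util.zip.ZipInputStream.

// Hypothetical stand-in for the unzip(File, File) helper used in the test above.
private static void unzip(File zipFile, File outDir) throws IOException {
    try (ZipInputStream in = new ZipInputStream(new BufferedInputStream(new FileInputStream(zipFile)))) {
        ZipEntry entry;
        byte[] buffer = new byte[8192];
        while ((entry = in.getNextEntry()) != null) {
            File out = new File(outDir, entry.getName());
            // Keep every entry inside outDir (guards against zip-slip).
            if (!out.getCanonicalPath().startsWith(outDir.getCanonicalPath() + File.separator)) {
                throw new IOException("Bad zip entry: " + entry.getName());
            }
            if (entry.isDirectory()) {
                out.mkdirs();
                continue;
            }
            out.getParentFile().mkdirs();
            try (OutputStream os = new FileOutputStream(out)) {
                int len;
                while ((len = in.read(buffer)) != -1) {
                    os.write(buffer, 0, len);
                }
            }
        }
    }
}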

Example 2 with HashBasedNumberedShardSpec

Use of io.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

From the class IndexTask, method determineShardSpecs:

/**
   * Determines the number of shards for each interval using a hash of the queryGranularity-bucketed timestamp
   * plus all dimensions (i.e. hash-based partitioning). In the future we may want to also support
   * single-dimension partitioning.
   */
private Map<Interval, List<ShardSpec>> determineShardSpecs(final TaskToolbox toolbox, final FirehoseFactory firehoseFactory) throws IOException {
    final ObjectMapper jsonMapper = toolbox.getObjectMapper();
    final GranularitySpec granularitySpec = ingestionSchema.getDataSchema().getGranularitySpec();
    final Granularity queryGranularity = granularitySpec.getQueryGranularity();
    final boolean determineNumPartitions = ingestionSchema.getTuningConfig().getNumShards() == null;
    final boolean determineIntervals = !ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().isPresent();
    final Map<Interval, List<ShardSpec>> shardSpecs = Maps.newHashMap();
    // if we were given number of shards per interval and the intervals, we don't need to scan the data
    if (!determineNumPartitions && !determineIntervals) {
        log.info("numShards and intervals provided, skipping determine partition scan");
        final SortedSet<Interval> intervals = ingestionSchema.getDataSchema().getGranularitySpec().bucketIntervals().get();
        final int numShards = ingestionSchema.getTuningConfig().getNumShards();
        for (Interval interval : intervals) {
            final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
            if (numShards > 1) {
                for (int i = 0; i < numShards; i++) {
                    intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
                }
            } else {
                intervalShardSpecs.add(NoneShardSpec.instance());
            }
            shardSpecs.put(interval, intervalShardSpecs);
        }
        return shardSpecs;
    }
    // determine intervals containing data and prime HLL collectors
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = Maps.newHashMap();
    int thrownAway = 0;
    log.info("Determining intervals and shardSpecs");
    long determineShardSpecsStartMillis = System.currentTimeMillis();
    try (final Firehose firehose = firehoseFactory.connect(ingestionSchema.getDataSchema().getParser())) {
        while (firehose.hasMore()) {
            final InputRow inputRow = firehose.nextRow();
            final Interval interval;
            if (determineIntervals) {
                interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
            } else {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                if (!optInterval.isPresent()) {
                    thrownAway++;
                    continue;
                }
                interval = optInterval.get();
            }
            if (!determineNumPartitions) {
                // we don't need to determine the number of partitions, but we still need to track which
                // intervals contain data, so record an absent entry for the interval and don't
                // instantiate a HLL collector
                if (!hllCollectors.containsKey(interval)) {
                    hllCollectors.put(interval, Optional.<HyperLogLogCollector>absent());
                }
                continue;
            }
            if (!hllCollectors.containsKey(interval)) {
                hllCollectors.put(interval, Optional.of(HyperLogLogCollector.makeLatestCollector()));
            }
            List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestamp()).getMillis(), inputRow);
            hllCollectors.get(interval).get().add(hashFunction.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
        }
    }
    if (thrownAway > 0) {
        log.warn("Unable to to find a matching interval for [%,d] events", thrownAway);
    }
    final ImmutableSortedMap<Interval, Optional<HyperLogLogCollector>> sortedMap = ImmutableSortedMap.copyOf(hllCollectors, Comparators.intervalsByStartThenEnd());
    for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : sortedMap.entrySet()) {
        final Interval interval = entry.getKey();
        final Optional<HyperLogLogCollector> collector = entry.getValue();
        final int numShards;
        if (determineNumPartitions) {
            final long numRows = (long) collector.get().estimateCardinality();
            numShards = (int) Math.ceil((double) numRows / ingestionSchema.getTuningConfig().getTargetPartitionSize());
            log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numShards);
        } else {
            numShards = ingestionSchema.getTuningConfig().getNumShards();
            log.info("Creating [%,d] shards for interval [%s]", numShards, interval);
        }
        final List<ShardSpec> intervalShardSpecs = Lists.newArrayListWithCapacity(numShards);
        if (numShards > 1) {
            for (int i = 0; i < numShards; i++) {
                intervalShardSpecs.add(new HashBasedNumberedShardSpec(i, numShards, null, jsonMapper));
            }
        } else {
            intervalShardSpecs.add(NoneShardSpec.instance());
        }
        shardSpecs.put(interval, intervalShardSpecs);
    }
    log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
    return shardSpecs;
}
Also used:
Granularity (io.druid.java.util.common.granularity.Granularity)
NoneShardSpec (io.druid.timeline.partition.NoneShardSpec)
ShardSpec (io.druid.timeline.partition.ShardSpec)
NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec)
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)
List (java.util.List)
ImmutableList (com.google.common.collect.ImmutableList)
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)
Optional (com.google.common.base.Optional)
Firehose (io.druid.data.input.Firehose)
HyperLogLogCollector (io.druid.hll.HyperLogLogCollector)
GranularitySpec (io.druid.segment.indexing.granularity.GranularitySpec)
InputRow (io.druid.data.input.InputRow)
Map (java.util.Map)
ImmutableMap (com.google.common.collect.ImmutableMap)
ImmutableSortedMap (com.google.common.collect.ImmutableSortedMap)
Interval (org.joda.time.Interval)
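
Two details of determineShardSpecs are worth pulling out: each row is reduced to a group key of (query-granularity bucket start, dimension values) that is serialized and hashed into the interval's HLL collector, and the resulting cardinality estimate is divided by targetPartitionSize to pick a shard count. The following is a standalone sketch of that arithmetic, assuming Guava's murmur3_128 as the hash function (the snippet's hashFunction field is not shown) and using illustrative numbers.

// Sketch of the cardinality-to-shard-count step, outside of Druid.
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

public class ShardCountSketch {
    public static void main(String[] args) {
        // Assumption: Druid's hashFunction is murmur3_128 here.
        HashFunction hashFunction = Hashing.murmur3_128();

        // In the task, the group key is (bucketed timestamp, dimension values),
        // serialized with the JSON mapper before being hashed into the HLL collector.
        byte[] groupKey = "[1414972800000,{\"host\":[\"a.example.com\"]}]".getBytes();
        byte[] hashed = hashFunction.hashBytes(groupKey).asBytes();
        System.out.println("hashed group key: " + hashed.length + " bytes");

        // Final step: shards = ceil(estimatedRows / targetPartitionSize).
        long estimatedRows = 7_500_000L;     // e.g. from collector.estimateCardinality()
        int targetPartitionSize = 5_000_000; // tuning config value (illustrative)
        int numShards = (int) Math.ceil((double) estimatedRows / targetPartitionSize);
        System.out.println("numShards = " + numShards); // prints 2
    }
}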

Example 3 with HashBasedNumberedShardSpec

Use of io.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

From the class BatchDeltaIngestionTest, method makeHadoopDruidIndexerConfig:

private HadoopDruidIndexerConfig makeHadoopDruidIndexerConfig(Map<String, Object> inputSpec, File tmpDir) throws Exception {
    HadoopDruidIndexerConfig config = new HadoopDruidIndexerConfig(
        new HadoopIngestionSpec(
            new DataSchema(
                "website",
                MAPPER.convertValue(
                    new StringInputRowParser(
                        new CSVParseSpec(
                            new TimestampSpec("timestamp", "yyyyMMddHH", null),
                            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null),
                            null,
                            ImmutableList.of("timestamp", "host", "host2", "visited_num")
                        ),
                        null
                    ),
                    Map.class
                ),
                new AggregatorFactory[] {
                    new LongSumAggregatorFactory("visited_sum", "visited_num"),
                    new HyperUniquesAggregatorFactory("unique_hosts", "host2")
                },
                new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)),
                MAPPER
            ),
            new HadoopIOConfig(inputSpec, null, tmpDir.getCanonicalPath()),
            new HadoopTuningConfig(tmpDir.getCanonicalPath(), null, null, null, null, null, false, false, false, false, null, false, false, null, null, null, false, false)
        )
    );
    config.setShardSpecs(
        ImmutableMap.<Long, List<HadoopyShardSpec>>of(
            INTERVAL_FULL.getStartMillis(),
            ImmutableList.of(new HadoopyShardSpec(new HashBasedNumberedShardSpec(0, 1, null, HadoopDruidIndexerConfig.JSON_MAPPER), 0))
        )
    );
    config = HadoopDruidIndexerConfig.fromSpec(config.getSchema());
    return config;
}
Also used:
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory)
DataSchema (io.druid.segment.indexing.DataSchema)
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec)
CSVParseSpec (io.druid.data.input.impl.CSVParseSpec)
StringInputRowParser (io.druid.data.input.impl.StringInputRowParser)
TimestampSpec (io.druid.data.input.impl.TimestampSpec)
HyperUniquesAggregatorFactory (io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory)
DimensionsSpec (io.druid.data.input.impl.DimensionsSpec)
Map (java.util.Map)
ImmutableMap (com.google.common.collect.ImmutableMap)
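
The single-expression constructor above is easier to follow once the parser is pulled out on its own. Here is a hedged sketch of feeding one made-up CSV line through the same StringInputRowParser configuration; it assumes the parse(String) convenience method this class exposes, and the input line and expected outputs are illustrative.

// Illustrative only: one CSV row matching the spec's column list
// (timestamp, host, host2, visited_num) with timestamp format yyyyMMddHH.
StringInputRowParser parser = new StringInputRowParser(
    new CSVParseSpec(
        new TimestampSpec("timestamp", "yyyyMMddHH", null),
        new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("host")), null, null),
        null,
        ImmutableList.of("timestamp", "host", "host2", "visited_num")
    ),
    null
);
InputRow row = parser.parse("2014102200,a.example.com,b.example.com,100");
System.out.println(row.getTimestamp());        // e.g. 2014-10-22T00:00:00.000Z (zone-dependent)
System.out.println(row.getDimension("host"));  // [a.example.com]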

Example 4 with HashBasedNumberedShardSpec

Use of io.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

From the class DetermineHashedPartitionsJob, method run:

public boolean run() {
    try {
        /*
       * Group by (timestamp, dimensions) so we can correctly count dimension values as they would appear
       * in the final segment.
       */
        long startTime = System.currentTimeMillis();
        final Job groupByJob = Job.getInstance(new Configuration(), String.format("%s-determine_partitions_hashed-%s", config.getDataSource(), config.getIntervals()));
        JobHelper.injectSystemProperties(groupByJob);
        config.addJobProperties(groupByJob);
        groupByJob.setMapperClass(DetermineCardinalityMapper.class);
        groupByJob.setMapOutputKeyClass(LongWritable.class);
        groupByJob.setMapOutputValueClass(BytesWritable.class);
        groupByJob.setReducerClass(DetermineCardinalityReducer.class);
        groupByJob.setOutputKeyClass(NullWritable.class);
        groupByJob.setOutputValueClass(NullWritable.class);
        groupByJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        groupByJob.setPartitionerClass(DetermineHashedPartitionsPartitioner.class);
        if (!config.getSegmentGranularIntervals().isPresent()) {
            groupByJob.setNumReduceTasks(1);
        } else {
            groupByJob.setNumReduceTasks(config.getSegmentGranularIntervals().get().size());
        }
        JobHelper.setupClasspath(JobHelper.distributedClassPath(config.getWorkingPath()), JobHelper.distributedClassPath(config.makeIntermediatePath()), groupByJob);
        config.addInputPaths(groupByJob);
        config.intoConfiguration(groupByJob);
        FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
        groupByJob.submit();
        log.info("Job %s submitted, status available at: %s", groupByJob.getJobName(), groupByJob.getTrackingURL());
        if (!groupByJob.waitForCompletion(true)) {
            log.error("Job failed: %s", groupByJob.getJobID());
            return false;
        }
        /*
       * Load partitions and intervals determined by the previous job.
       */
        log.info("Job completed, loading up partitions for intervals[%s].", config.getSegmentGranularIntervals());
        FileSystem fileSystem = null;
        if (!config.getSegmentGranularIntervals().isPresent()) {
            final Path intervalInfoPath = config.makeIntervalInfoPath();
            fileSystem = intervalInfoPath.getFileSystem(groupByJob.getConfiguration());
            if (!Utils.exists(groupByJob, fileSystem, intervalInfoPath)) {
                throw new ISE("Path[%s] didn't exist!?", intervalInfoPath);
            }
            List<Interval> intervals = config.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, intervalInfoPath), new TypeReference<List<Interval>>() {
            });
            config.setGranularitySpec(new UniformGranularitySpec(config.getGranularitySpec().getSegmentGranularity(), config.getGranularitySpec().getQueryGranularity(), config.getGranularitySpec().isRollup(), intervals));
            log.info("Determined Intervals for Job [%s].", config.getSegmentGranularIntervals());
        }
        Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap(DateTimeComparator.getInstance());
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            DateTime bucket = segmentGranularity.getStart();
            final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
            if (fileSystem == null) {
                fileSystem = partitionInfoPath.getFileSystem(groupByJob.getConfiguration());
            }
            if (Utils.exists(groupByJob, fileSystem, partitionInfoPath)) {
                final Long numRows = config.JSON_MAPPER.readValue(Utils.openInputStream(groupByJob, partitionInfoPath), new TypeReference<Long>() {
                });
                log.info("Found approximately [%,d] rows in data.", numRows);
                final int numberOfShards = (int) Math.ceil((double) numRows / config.getTargetPartitionSize());
                log.info("Creating [%,d] shards", numberOfShards);
                List<HadoopyShardSpec> actualSpecs = Lists.newArrayListWithExpectedSize(numberOfShards);
                if (numberOfShards == 1) {
                    actualSpecs.add(new HadoopyShardSpec(NoneShardSpec.instance(), shardCount++));
                } else {
                    for (int i = 0; i < numberOfShards; ++i) {
                        actualSpecs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, numberOfShards, null, HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
                        log.info("DateTime[%s], partition[%d], spec[%s]", bucket, i, actualSpecs.get(i));
                    }
                }
                shardSpecs.put(bucket.getMillis(), actualSpecs);
            } else {
                log.info("Path[%s] didn't exist!?", partitionInfoPath);
            }
        }
        config.setShardSpecs(shardSpecs);
        log.info("DetermineHashedPartitionsJob took %d millis", (System.currentTimeMillis() - startTime));
        return true;
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
Also used:
Path (org.apache.hadoop.fs.Path)
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)
Configuration (org.apache.hadoop.conf.Configuration)
DateTime (org.joda.time.DateTime)
IOException (java.io.IOException)
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec)
FileSystem (org.apache.hadoop.fs.FileSystem)
ISE (io.druid.java.util.common.ISE)
List (java.util.List)
Job (org.apache.hadoop.mapreduce.Job)
Interval (org.joda.time.Interval)
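
The job's end product is the shardSpecs map passed to config.setShardSpecs: segment-granularity bucket start millis mapped to one HadoopyShardSpec per partition, with shardCount increasing monotonically across buckets so every spec gets a unique number. A minimal sketch of that shape for two daily buckets (made-up dates; constructors as used in the source):

// Sketch: two daily buckets, two hash partitions each; shardCount keeps
// increasing across buckets so every HadoopyShardSpec is uniquely numbered.
Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
int shardCount = 0;
for (DateTime bucket : ImmutableList.of(new DateTime("2014-10-22"), new DateTime("2014-10-23"))) {
    List<HadoopyShardSpec> specs = Lists.newArrayListWithCapacity(2);
    for (int i = 0; i < 2; i++) {
        specs.add(new HadoopyShardSpec(
            new HashBasedNumberedShardSpec(i, 2, null, HadoopDruidIndexerConfig.JSON_MAPPER),
            shardCount++));
    }
    shardSpecs.put(bucket.getMillis(), specs);
}
// shardSpecs now holds four specs, numbered 0..3 across the two buckets.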

Example 5 with HashBasedNumberedShardSpec

Use of io.druid.timeline.partition.HashBasedNumberedShardSpec in project druid by druid-io.

From the class HadoopDruidDetermineConfigurationJob, method run:

@Override
public boolean run() {
    List<Jobby> jobs = Lists.newArrayList();
    JobHelper.ensurePaths(config);
    if (config.isDeterminingPartitions()) {
        jobs.add(config.getPartitionsSpec().getPartitionJob(config));
    } else {
        int shardsPerInterval = config.getPartitionsSpec().getNumShards();
        Map<Long, List<HadoopyShardSpec>> shardSpecs = Maps.newTreeMap();
        int shardCount = 0;
        for (Interval segmentGranularity : config.getSegmentGranularIntervals().get()) {
            DateTime bucket = segmentGranularity.getStart();
            if (shardsPerInterval > 0) {
                List<HadoopyShardSpec> specs = Lists.newArrayListWithCapacity(shardsPerInterval);
                for (int i = 0; i < shardsPerInterval; i++) {
                    specs.add(new HadoopyShardSpec(new HashBasedNumberedShardSpec(i, shardsPerInterval, config.getPartitionsSpec().getPartitionDimensions(), HadoopDruidIndexerConfig.JSON_MAPPER), shardCount++));
                }
                shardSpecs.put(bucket.getMillis(), specs);
                log.info("DateTime[%s], spec[%s]", bucket, specs);
            } else {
                final HadoopyShardSpec spec = new HadoopyShardSpec(NoneShardSpec.instance(), shardCount++);
                shardSpecs.put(bucket.getMillis(), Lists.newArrayList(spec));
                log.info("DateTime[%s], spec[%s]", bucket, spec);
            }
        }
        config.setShardSpecs(shardSpecs);
    }
    return JobHelper.runJobs(jobs, config);
}
Also used:
HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec)
List (java.util.List)
DateTime (org.joda.time.DateTime)
Interval (org.joda.time.Interval)
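
Finally, a hedged sketch of what these hash partitions do at ingestion time, assuming the isInChunk(long, InputRow) signature that ShardSpec carried in this generation of Druid and the MapBasedInputRow constructor listed under Aggregations below: exactly one partition of the set should accept any given row.

// Hedged sketch: route one made-up row across a two-partition hash spec set.
List<ShardSpec> specs = ImmutableList.<ShardSpec>of(
    new HashBasedNumberedShardSpec(0, 2, null, HadoopDruidIndexerConfig.JSON_MAPPER),
    new HashBasedNumberedShardSpec(1, 2, null, HadoopDruidIndexerConfig.JSON_MAPPER));
InputRow row = new MapBasedInputRow(
    new DateTime("2014-10-22T00:00:00Z"),
    ImmutableList.of("host"),
    ImmutableMap.<String, Object>of("host", "a.example.com", "visited_num", 10));
for (ShardSpec spec : specs) {
    // Assumption: isInChunk(long, InputRow) is the routing predicate.
    System.out.printf("partition %d accepts row: %s%n",
        spec.getPartitionNum(), spec.isInChunk(row.getTimestampFromEpoch(), row));
}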

Aggregations

HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec): 9 uses
Interval (org.joda.time.Interval): 7 uses
DateTime (org.joda.time.DateTime): 5 uses
List (java.util.List): 4 uses
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec): 3 uses
DataSegment (io.druid.timeline.DataSegment): 3 uses
File (java.io.File): 3 uses
ImmutableList (com.google.common.collect.ImmutableList): 2 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 2 uses
Firehose (io.druid.data.input.Firehose): 2 uses
InputRow (io.druid.data.input.InputRow): 2 uses
QueryableIndex (io.druid.segment.QueryableIndex): 2 uses
DataSchema (io.druid.segment.indexing.DataSchema): 2 uses
NumberedShardSpec (io.druid.timeline.partition.NumberedShardSpec): 2 uses
ShardSpec (io.druid.timeline.partition.ShardSpec): 2 uses
Map (java.util.Map): 2 uses
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1 use
Optional (com.google.common.base.Optional): 1 use
ImmutableSortedMap (com.google.common.collect.ImmutableSortedMap): 1 use
MapBasedInputRow (io.druid.data.input.MapBasedInputRow): 1 use