
Example 16 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class ITAppendBatchIndexTest, method submitIngestionTaskAndVerify:

private void submitIngestionTaskAndVerify(String indexDatasource, PartitionsSpec partitionsSpec, boolean appendToExisting, Pair<Boolean, Boolean> segmentAvailabilityConfirmationPair) throws Exception {
    InputFormatDetails inputFormatDetails = InputFormatDetails.JSON;
    Map<String, Object> inputFormatMap = new ImmutableMap.Builder<String, Object>().put("type", inputFormatDetails.getInputFormatType()).build();
    final Function<String, String> sqlInputSourcePropsTransform = spec -> {
        try {
            spec = StringUtils.replace(spec, "%%PARTITIONS_SPEC%%", jsonMapper.writeValueAsString(partitionsSpec));
            spec = StringUtils.replace(spec, "%%INPUT_SOURCE_FILTER%%", "*" + inputFormatDetails.getFileExtension());
            spec = StringUtils.replace(spec, "%%INPUT_SOURCE_BASE_DIR%%", "/resources/data/batch_index" + inputFormatDetails.getFolderSuffix());
            spec = StringUtils.replace(spec, "%%INPUT_FORMAT%%", jsonMapper.writeValueAsString(inputFormatMap));
            spec = StringUtils.replace(spec, "%%APPEND_TO_EXISTING%%", jsonMapper.writeValueAsString(appendToExisting));
            spec = StringUtils.replace(spec, "%%DROP_EXISTING%%", jsonMapper.writeValueAsString(false));
            if (partitionsSpec instanceof DynamicPartitionsSpec) {
                spec = StringUtils.replace(spec, "%%FORCE_GUARANTEED_ROLLUP%%", jsonMapper.writeValueAsString(false));
            } else if (partitionsSpec instanceof HashedPartitionsSpec || partitionsSpec instanceof SingleDimensionPartitionsSpec) {
                spec = StringUtils.replace(spec, "%%FORCE_GUARANTEED_ROLLUP%%", jsonMapper.writeValueAsString(true));
            }
            return spec;
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    };
    doIndexTest(indexDatasource, INDEX_TASK, sqlInputSourcePropsTransform, null, false, false, true, segmentAvailabilityConfirmationPair);
}
Also used: org.apache.druid.java.util.common.logger.Logger, org.testng.annotations.DataProvider, com.google.common.collect.ImmutableMap, org.apache.druid.java.util.common.StringUtils, org.apache.druid.testing.guice.DruidTestModuleFactory, org.apache.druid.indexer.partitions.HashedPartitionsSpec, org.testng.annotations.Test, java.util.UUID, java.util.function.Function, org.testng.annotations.Guice, org.apache.druid.java.util.common.Pair, java.util.List, com.google.common.collect.ImmutableList, org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec, org.apache.druid.tests.TestNGGroup, java.io.Closeable, java.util.Map, org.apache.druid.indexer.partitions.DynamicPartitionsSpec, org.apache.druid.indexer.partitions.PartitionsSpec
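
The %%PARTITIONS_SPEC%% placeholder is filled with whatever Jackson produces for the PartitionsSpec argument. Here is a minimal sketch of that substitution, assuming a default ObjectMapper picks up the type-resolving annotations on Druid's spec classes (the three-argument HashedPartitionsSpec constructor is the same one used in the DataSegment tests below):

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;

public class PartitionsSpecPlaceholderSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper jsonMapper = new ObjectMapper();
        // maxRowsPerSegment = 100000, numShards = null, partitionDimensions = ["dim1"]
        PartitionsSpec hashed = new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1"));
        // Should print a JSON object with "type": "hashed", matching the shape in Example 17.
        System.out.println(jsonMapper.writeValueAsString(hashed));
        // Mirrors the branch above: only dynamic partitioning leaves forced rollup off.
        boolean forceGuaranteedRollup = !(hashed instanceof DynamicPartitionsSpec);
        System.out.println(forceGuaranteedRollup); // true for hashed and single-dimension specs
    }
}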

Example 17 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class DataSegmentTest, method testDeserializationDataSegmentLastCompactionStateWithNullSpecs:

@Test
public void testDeserializationDataSegmentLastCompactionStateWithNullSpecs() throws Exception {
    final Interval interval = Intervals.of("2011-10-01/2011-10-02");
    final ImmutableMap<String, Object> loadSpec = ImmutableMap.of("something", "or_other");
    DataSegment segment = new DataSegment(
        "something",
        interval,
        "1",
        loadSpec,
        Arrays.asList("dim1", "dim2"),
        Arrays.asList("met1", "met2"),
        new NumberedShardSpec(3, 0),
        new CompactionState(
            new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1")),
            // dimensionsSpec, metricsSpec, and transformSpec are deliberately null here
            null,
            null,
            null,
            ImmutableMap.of(),
            ImmutableMap.of()
        ),
        TEST_VERSION,
        1
    );
    // lastCompactionState has null specs for dimensionSpec and transformSpec
    String lastCompactionStateWithNullSpecs = "{"
        + "\"dataSource\": \"something\","
        + "\"interval\": \"2011-10-01T00:00:00.000Z/2011-10-02T00:00:00.000Z\","
        + "\"version\": \"1\","
        + "\"loadSpec\": {"
        + " \"something\": \"or_other\""
        + "},"
        + "\"dimensions\": \"dim1,dim2\","
        + "\"metrics\": \"met1,met2\","
        + "\"shardSpec\": {"
        + " \"type\": \"numbered\","
        + " \"partitionNum\": 3,"
        + " \"partitions\": 0"
        + "},"
        + "\"lastCompactionState\": {"
        + " \"partitionsSpec\": {"
        + "  \"type\": \"hashed\","
        + "  \"numShards\": null,"
        + "  \"partitionDimensions\": [\"dim1\"],"
        + "  \"partitionFunction\": \"murmur3_32_abs\","
        + "  \"maxRowsPerSegment\": 100000"
        + " },"
        + " \"indexSpec\": {},"
        + " \"granularitySpec\": {}"
        + "},"
        + "\"binaryVersion\": 9,"
        + "\"size\": 1,"
        + "\"identifier\": \"something_2011-10-01T00:00:00.000Z_2011-10-02T00:00:00.000Z_1_3\""
        + "}";
    final Map<String, Object> objectMap = MAPPER.readValue(lastCompactionStateWithNullSpecs, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT);
    Assert.assertEquals(11, objectMap.size());
    Assert.assertEquals("something", objectMap.get("dataSource"));
    Assert.assertEquals(interval.toString(), objectMap.get("interval"));
    Assert.assertEquals("1", objectMap.get("version"));
    Assert.assertEquals(loadSpec, objectMap.get("loadSpec"));
    Assert.assertEquals("dim1,dim2", objectMap.get("dimensions"));
    Assert.assertEquals("met1,met2", objectMap.get("metrics"));
    Assert.assertEquals(ImmutableMap.of("type", "numbered", "partitionNum", 3, "partitions", 0), objectMap.get("shardSpec"));
    Assert.assertEquals(TEST_VERSION, objectMap.get("binaryVersion"));
    Assert.assertEquals(1, objectMap.get("size"));
    Assert.assertEquals(3, ((Map) objectMap.get("lastCompactionState")).size());
    DataSegment deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
    Assert.assertEquals(segment.getDataSource(), deserializedSegment.getDataSource());
    Assert.assertEquals(segment.getInterval(), deserializedSegment.getInterval());
    Assert.assertEquals(segment.getVersion(), deserializedSegment.getVersion());
    Assert.assertEquals(segment.getLoadSpec(), deserializedSegment.getLoadSpec());
    Assert.assertEquals(segment.getDimensions(), deserializedSegment.getDimensions());
    Assert.assertEquals(segment.getMetrics(), deserializedSegment.getMetrics());
    Assert.assertEquals(segment.getShardSpec(), deserializedSegment.getShardSpec());
    Assert.assertEquals(segment.getSize(), deserializedSegment.getSize());
    Assert.assertEquals(segment.getId(), deserializedSegment.getId());
    Assert.assertEquals(segment.getLastCompactionState(), deserializedSegment.getLastCompactionState());
    Assert.assertNotNull(segment.getLastCompactionState());
    Assert.assertNull(segment.getLastCompactionState().getDimensionsSpec());
    Assert.assertNull(segment.getLastCompactionState().getTransformSpec());
    Assert.assertNull(segment.getLastCompactionState().getMetricsSpec());
    Assert.assertNotNull(deserializedSegment.getLastCompactionState());
    Assert.assertNull(deserializedSegment.getLastCompactionState().getDimensionsSpec());
    deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
    Assert.assertEquals(0, segment.compareTo(deserializedSegment));
    deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
    Assert.assertEquals(0, deserializedSegment.compareTo(segment));
    deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
    Assert.assertEquals(segment.hashCode(), deserializedSegment.hashCode());
}
Also used: org.apache.druid.indexer.partitions.HashedPartitionsSpec, org.apache.druid.timeline.partition.NumberedShardSpec, org.joda.time.Interval, org.junit.Test
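
Reading the partitionsSpec fragment back should yield an object equal to the constructor form, which is what the test's lastCompactionState equality implicitly checks. A small sketch, again assuming a default ObjectMapper can resolve "type": "hashed" from the annotations on PartitionsSpec:

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;

public class HashedSpecDeserializationSketch {
    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        String json = "{"
            + "\"type\": \"hashed\","
            + "\"numShards\": null,"
            + "\"partitionDimensions\": [\"dim1\"],"
            + "\"partitionFunction\": \"murmur3_32_abs\","
            + "\"maxRowsPerSegment\": 100000"
            + "}";
        PartitionsSpec spec = mapper.readValue(json, PartitionsSpec.class);
        // Equality with the three-argument constructor mirrors the test's
        // CompactionState assertion above.
        System.out.println(spec.equals(new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1"))));
    }
}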

Example 18 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class DataSegmentTest, method testV1Serialization:

@Test
public void testV1Serialization() throws Exception {
    final Interval interval = Intervals.of("2011-10-01/2011-10-02");
    final ImmutableMap<String, Object> loadSpec = ImmutableMap.of("something", "or_other");
    DataSegment segment = new DataSegment(
        "something",
        interval,
        "1",
        loadSpec,
        Arrays.asList("dim1", "dim2"),
        Arrays.asList("met1", "met2"),
        new NumberedShardSpec(3, 0),
        new CompactionState(
            new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1")),
            new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "bar", "foo"))),
            ImmutableList.of(ImmutableMap.of("type", "count", "name", "count")),
            ImmutableMap.of("filter", ImmutableMap.of("type", "selector", "dimension", "dim1", "value", "foo")),
            ImmutableMap.of(),
            ImmutableMap.of()
        ),
        TEST_VERSION,
        1
    );
    final Map<String, Object> objectMap = MAPPER.readValue(MAPPER.writeValueAsString(segment), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT);
    Assert.assertEquals(11, objectMap.size());
    Assert.assertEquals("something", objectMap.get("dataSource"));
    Assert.assertEquals(interval.toString(), objectMap.get("interval"));
    Assert.assertEquals("1", objectMap.get("version"));
    Assert.assertEquals(loadSpec, objectMap.get("loadSpec"));
    Assert.assertEquals("dim1,dim2", objectMap.get("dimensions"));
    Assert.assertEquals("met1,met2", objectMap.get("metrics"));
    Assert.assertEquals(ImmutableMap.of("type", "numbered", "partitionNum", 3, "partitions", 0), objectMap.get("shardSpec"));
    Assert.assertEquals(TEST_VERSION, objectMap.get("binaryVersion"));
    Assert.assertEquals(1, objectMap.get("size"));
    Assert.assertEquals(6, ((Map) objectMap.get("lastCompactionState")).size());
    DataSegment deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
    Assert.assertEquals(segment.getDataSource(), deserializedSegment.getDataSource());
    Assert.assertEquals(segment.getInterval(), deserializedSegment.getInterval());
    Assert.assertEquals(segment.getVersion(), deserializedSegment.getVersion());
    Assert.assertEquals(segment.getLoadSpec(), deserializedSegment.getLoadSpec());
    Assert.assertEquals(segment.getDimensions(), deserializedSegment.getDimensions());
    Assert.assertEquals(segment.getMetrics(), deserializedSegment.getMetrics());
    Assert.assertEquals(segment.getShardSpec(), deserializedSegment.getShardSpec());
    Assert.assertEquals(segment.getSize(), deserializedSegment.getSize());
    Assert.assertEquals(segment.getId(), deserializedSegment.getId());
    Assert.assertEquals(segment.getLastCompactionState(), deserializedSegment.getLastCompactionState());
    deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
    Assert.assertEquals(0, segment.compareTo(deserializedSegment));
    deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
    Assert.assertEquals(0, deserializedSegment.compareTo(segment));
    deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
    Assert.assertEquals(segment.hashCode(), deserializedSegment.hashCode());
}
Also used: org.apache.druid.indexer.partitions.HashedPartitionsSpec, org.apache.druid.data.input.impl.DimensionsSpec, org.apache.druid.timeline.partition.NumberedShardSpec, org.joda.time.Interval, org.junit.Test
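
Both DataSegmentTest methods repeat the same serialize-then-deserialize-then-compare pattern. A hypothetical helper (assertSegmentRoundTrips is our name, not Druid's) condensing its core; note the mapper must be configured like the tests' MAPPER, since DataSegment deserialization relies on Druid-configured injectable values:

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.timeline.DataSegment;
import org.junit.Assert;

public final class SegmentRoundTripSketch {
    // Hypothetical helper condensing the assertions the two tests make inline.
    static void assertSegmentRoundTrips(ObjectMapper mapper, DataSegment segment) throws Exception {
        DataSegment deserialized = mapper.readValue(mapper.writeValueAsString(segment), DataSegment.class);
        Assert.assertEquals(segment.getId(), deserialized.getId());
        Assert.assertEquals(segment.getShardSpec(), deserialized.getShardSpec());
        Assert.assertEquals(segment.getLastCompactionState(), deserialized.getLastCompactionState());
        Assert.assertEquals(0, segment.compareTo(deserialized));
        Assert.assertEquals(segment.hashCode(), deserialized.hashCode());
    }
}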

Example 19 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class IndexTask, method createShardSpecsFromInput:

private PartitionAnalysis createShardSpecsFromInput(ObjectMapper jsonMapper, IndexIngestionSpec ingestionSchema, InputSource inputSource, File tmpDir, GranularitySpec granularitySpec, @Nonnull PartitionsSpec partitionsSpec, boolean determineIntervals) throws IOException {
    assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
    long determineShardSpecsStartMillis = System.currentTimeMillis();
    final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(jsonMapper, ingestionSchema, inputSource, tmpDir, granularitySpec, partitionsSpec, determineIntervals);
    final PartitionAnalysis<Integer, ?> partitionAnalysis;
    if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
        partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
    } else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
        partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
    } else {
        throw new UOE("%s", partitionsSpec.getClass().getName());
    }
    for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
        final Interval interval = entry.getKey();
        final int numBucketsPerInterval;
        if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
            final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
            final HyperLogLogCollector collector = entry.getValue().orNull();
            if (partitionsSpec.needsDeterminePartitions(false)) {
                final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
                final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT : partitionsSpec.getMaxRowsPerSegment();
                numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
                log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numBucketsPerInterval);
            } else {
                numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
                log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
            }
        } else {
            numBucketsPerInterval = 1;
        }
        partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
    }
    log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
    return partitionAnalysis;
}
Also used: org.apache.druid.indexer.partitions.HashedPartitionsSpec, com.google.common.base.Optional, org.apache.druid.hll.HyperLogLogCollector, org.apache.druid.java.util.common.UOE, org.apache.druid.indexing.common.task.batch.partition.LinearPartitionAnalysis, org.apache.druid.indexing.common.task.batch.partition.HashPartitionAnalysis, org.apache.druid.indexer.partitions.DynamicPartitionsSpec, java.util.Map, java.util.TreeMap, java.util.HashMap, org.joda.time.Interval
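
The shard-count arithmetic in the hashed branch is plain ceiling division: one bucket per maxRowsPerSegment-sized slice of the HLL cardinality estimate, rounded up. A self-contained sketch with made-up numbers:

public class BucketCountSketch {
    // Same formula as numBucketsPerInterval above.
    static int numBuckets(long estimatedRows, int maxRowsPerSegment) {
        return (int) Math.ceil((double) estimatedRows / maxRowsPerSegment);
    }

    public static void main(String[] args) {
        System.out.println(numBuckets(2_500_000L, 1_000_000)); // 3
        System.out.println(numBuckets(1_000_000L, 1_000_000)); // 1
        System.out.println(numBuckets(1L, 1_000_000));         // 1
    }
}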

Example 20 with HashedPartitionsSpec

Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.

From class PartialDimensionCardinalityTask, method runTask:

@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception {
    DataSchema dataSchema = ingestionSchema.getDataSchema();
    GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
    Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
    InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
    InputFormat inputFormat = inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null;
    final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
    final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(buildSegmentsMeters, tuningConfig.isLogParseExceptions(), tuningConfig.getMaxParseExceptions(), tuningConfig.getMaxSavedParseExceptions());
    final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
    try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(toolbox.getIndexingTmpDir(), dataSchema, inputSource, inputFormat, determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec), buildSegmentsMeters, parseExceptionHandler)) {
        Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
        sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
    }
    return TaskStatus.success(getId());
}
Also used: org.apache.druid.indexer.partitions.HashedPartitionsSpec, org.apache.druid.data.input.InputSource, org.apache.druid.segment.indexing.DataSchema, org.apache.druid.segment.indexing.granularity.GranularitySpec, org.apache.druid.data.input.InputFormat, org.apache.druid.segment.incremental.ParseExceptionHandler, org.apache.druid.data.input.InputRow, org.apache.druid.segment.incremental.RowIngestionMeters, org.joda.time.Interval
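
The interesting work hides in determineCardinalities, which is not shown here. A rough sketch of the idea it implements: keep one HyperLogLogCollector per interval and feed it a 128-bit hash of each row's group key. The collector API used here (makeLatestCollector, add, toByteArray) matches what Example 19 consumes, but the group-key shape and the murmur3_128 hashing are our assumptions, not a copy of the real method:

import com.google.common.hash.Hashing;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.druid.hll.HyperLogLogCollector;
import org.joda.time.Interval;

public class CardinalitySketch {
    // Hypothetical reduction of determineCardinalities: one HLL per interval,
    // serialized with toByteArray() for the DimensionCardinalityReport.
    static Map<Interval, byte[]> cardinalities(Map<Interval, Iterable<String>> groupKeysByInterval) {
        Map<Interval, byte[]> result = new HashMap<>();
        for (Map.Entry<Interval, Iterable<String>> e : groupKeysByInterval.entrySet()) {
            HyperLogLogCollector collector = HyperLogLogCollector.makeLatestCollector();
            for (String groupKey : e.getValue()) {
                // add() expects an already-hashed value, hence the explicit murmur3_128.
                collector.add(Hashing.murmur3_128().hashString(groupKey, StandardCharsets.UTF_8).asBytes());
            }
            result.put(e.getKey(), collector.toByteArray());
        }
        return result;
    }
}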

Aggregations

HashedPartitionsSpec (org.apache.druid.indexer.partitions.HashedPartitionsSpec): 43
Test (org.junit.Test): 31
Interval (org.joda.time.Interval): 20
DataSegment (org.apache.druid.timeline.DataSegment): 15
List (java.util.List): 14
ImmutableList (com.google.common.collect.ImmutableList): 12
PartitionsSpec (org.apache.druid.indexer.partitions.PartitionsSpec): 12
Map (java.util.Map): 11
SingleDimensionPartitionsSpec (org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec): 11
HashBasedNumberedShardSpec (org.apache.druid.timeline.partition.HashBasedNumberedShardSpec): 11
ArrayList (java.util.ArrayList): 10
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 9
StringUtils (org.apache.druid.java.util.common.StringUtils): 9
File (java.io.File): 8
HashMap (java.util.HashMap): 8
DynamicPartitionsSpec (org.apache.druid.indexer.partitions.DynamicPartitionsSpec): 8
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 8
HashPartitionFunction (org.apache.druid.timeline.partition.HashPartitionFunction): 8
ImmutableMap (com.google.common.collect.ImmutableMap): 7
IOException (java.io.IOException): 7