Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class ITAppendBatchIndexTest, method submitIngestionTaskAndVerify.
private void submitIngestionTaskAndVerify(
    String indexDatasource,
    PartitionsSpec partitionsSpec,
    boolean appendToExisting,
    Pair<Boolean, Boolean> segmentAvailabilityConfirmationPair
) throws Exception
{
  InputFormatDetails inputFormatDetails = InputFormatDetails.JSON;
  Map<String, Object> inputFormatMap = new ImmutableMap.Builder<String, Object>()
      .put("type", inputFormatDetails.getInputFormatType())
      .build();
  final Function<String, String> sqlInputSourcePropsTransform = spec -> {
    try {
      spec = StringUtils.replace(spec, "%%PARTITIONS_SPEC%%", jsonMapper.writeValueAsString(partitionsSpec));
      spec = StringUtils.replace(spec, "%%INPUT_SOURCE_FILTER%%", "*" + inputFormatDetails.getFileExtension());
      spec = StringUtils.replace(spec, "%%INPUT_SOURCE_BASE_DIR%%", "/resources/data/batch_index" + inputFormatDetails.getFolderSuffix());
      spec = StringUtils.replace(spec, "%%INPUT_FORMAT%%", jsonMapper.writeValueAsString(inputFormatMap));
      spec = StringUtils.replace(spec, "%%APPEND_TO_EXISTING%%", jsonMapper.writeValueAsString(appendToExisting));
      spec = StringUtils.replace(spec, "%%DROP_EXISTING%%", jsonMapper.writeValueAsString(false));
      if (partitionsSpec instanceof DynamicPartitionsSpec) {
        // Dynamic partitioning: best-effort rollup only.
        spec = StringUtils.replace(spec, "%%FORCE_GUARANTEED_ROLLUP%%", jsonMapper.writeValueAsString(false));
      } else if (partitionsSpec instanceof HashedPartitionsSpec || partitionsSpec instanceof SingleDimensionPartitionsSpec) {
        // Hashed and single-dimension partitioning require guaranteed (perfect) rollup.
        spec = StringUtils.replace(spec, "%%FORCE_GUARANTEED_ROLLUP%%", jsonMapper.writeValueAsString(true));
      }
      return spec;
    }
    catch (Exception e) {
      throw new RuntimeException(e);
    }
  };
  doIndexTest(indexDatasource, INDEX_TASK, sqlInputSourcePropsTransform, null, false, false, true, segmentAvailabilityConfirmationPair);
}
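The branch on the partitionsSpec type above reflects a general rule: dynamic (best-effort) partitioning leaves forceGuaranteedRollup off, while hashed and single-dimension partitioning require it. Below is a minimal standalone sketch of that mapping; the helper class and method names are illustrative, not Druid API.

import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;
import org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec;

final class RollupFlagHelper
{
  private RollupFlagHelper() {}

  // Dynamic partitioning -> best-effort rollup; hashed or single-dimension
  // partitioning -> forceGuaranteedRollup must be true.
  static boolean forceGuaranteedRollup(PartitionsSpec partitionsSpec)
  {
    if (partitionsSpec instanceof DynamicPartitionsSpec) {
      return false;
    } else if (partitionsSpec instanceof HashedPartitionsSpec
               || partitionsSpec instanceof SingleDimensionPartitionsSpec) {
      return true;
    }
    throw new IllegalArgumentException("Unsupported partitionsSpec: " + partitionsSpec.getClass().getName());
  }
}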
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class DataSegmentTest, method testDeserializationDataSegmentLastCompactionStateWithNullSpecs.
@Test
public void testDeserializationDataSegmentLastCompactionStateWithNullSpecs() throws Exception
{
  final Interval interval = Intervals.of("2011-10-01/2011-10-02");
  final ImmutableMap<String, Object> loadSpec = ImmutableMap.of("something", "or_other");
  DataSegment segment = new DataSegment(
      "something",
      interval,
      "1",
      loadSpec,
      Arrays.asList("dim1", "dim2"),
      Arrays.asList("met1", "met2"),
      new NumberedShardSpec(3, 0),
      new CompactionState(
          new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1")),
          null,
          null,
          null,
          ImmutableMap.of(),
          ImmutableMap.of()
      ),
      TEST_VERSION,
      1
  );
  // lastCompactionState has null specs for dimensionsSpec, metricsSpec, and transformSpec.
  String lastCompactionStateWithNullSpecs = "{"
      + "\"dataSource\": \"something\","
      + "\"interval\": \"2011-10-01T00:00:00.000Z/2011-10-02T00:00:00.000Z\","
      + "\"version\": \"1\","
      + "\"loadSpec\": {" + " \"something\": \"or_other\"" + "},"
      + "\"dimensions\": \"dim1,dim2\","
      + "\"metrics\": \"met1,met2\","
      + "\"shardSpec\": {" + " \"type\": \"numbered\"," + " \"partitionNum\": 3," + " \"partitions\": 0" + "},"
      + "\"lastCompactionState\": {"
      + " \"partitionsSpec\": {"
      + " \"type\": \"hashed\","
      + " \"numShards\": null,"
      + " \"partitionDimensions\": [\"dim1\"],"
      + " \"partitionFunction\": \"murmur3_32_abs\","
      + " \"maxRowsPerSegment\": 100000"
      + " },"
      + " \"indexSpec\": {},"
      + " \"granularitySpec\": {}"
      + "},"
      + "\"binaryVersion\": 9,"
      + "\"size\": 1,"
      + "\"identifier\": \"something_2011-10-01T00:00:00.000Z_2011-10-02T00:00:00.000Z_1_3\""
      + "}";
  final Map<String, Object> objectMap = MAPPER.readValue(lastCompactionStateWithNullSpecs, JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT);
  Assert.assertEquals(11, objectMap.size());
  Assert.assertEquals("something", objectMap.get("dataSource"));
  Assert.assertEquals(interval.toString(), objectMap.get("interval"));
  Assert.assertEquals("1", objectMap.get("version"));
  Assert.assertEquals(loadSpec, objectMap.get("loadSpec"));
  Assert.assertEquals("dim1,dim2", objectMap.get("dimensions"));
  Assert.assertEquals("met1,met2", objectMap.get("metrics"));
  Assert.assertEquals(ImmutableMap.of("type", "numbered", "partitionNum", 3, "partitions", 0), objectMap.get("shardSpec"));
  Assert.assertEquals(TEST_VERSION, objectMap.get("binaryVersion"));
  Assert.assertEquals(1, objectMap.get("size"));
  Assert.assertEquals(3, ((Map) objectMap.get("lastCompactionState")).size());
  DataSegment deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
  Assert.assertEquals(segment.getDataSource(), deserializedSegment.getDataSource());
  Assert.assertEquals(segment.getInterval(), deserializedSegment.getInterval());
  Assert.assertEquals(segment.getVersion(), deserializedSegment.getVersion());
  Assert.assertEquals(segment.getLoadSpec(), deserializedSegment.getLoadSpec());
  Assert.assertEquals(segment.getDimensions(), deserializedSegment.getDimensions());
  Assert.assertEquals(segment.getMetrics(), deserializedSegment.getMetrics());
  Assert.assertEquals(segment.getShardSpec(), deserializedSegment.getShardSpec());
  Assert.assertEquals(segment.getSize(), deserializedSegment.getSize());
  Assert.assertEquals(segment.getId(), deserializedSegment.getId());
  Assert.assertEquals(segment.getLastCompactionState(), deserializedSegment.getLastCompactionState());
  Assert.assertNotNull(segment.getLastCompactionState());
  Assert.assertNull(segment.getLastCompactionState().getDimensionsSpec());
  Assert.assertNull(segment.getLastCompactionState().getTransformSpec());
  Assert.assertNull(segment.getLastCompactionState().getMetricsSpec());
  Assert.assertNotNull(deserializedSegment.getLastCompactionState());
  Assert.assertNull(deserializedSegment.getLastCompactionState().getDimensionsSpec());
  deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
  Assert.assertEquals(0, segment.compareTo(deserializedSegment));
  deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
  Assert.assertEquals(0, deserializedSegment.compareTo(segment));
  deserializedSegment = MAPPER.readValue(lastCompactionStateWithNullSpecs, DataSegment.class);
  Assert.assertEquals(segment.hashCode(), deserializedSegment.hashCode());
}
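The JSON fixture above shows the wire form of a HashedPartitionsSpec built with the three-argument constructor: numShards stays null, maxRowsPerSegment is 100000, and partitionFunction falls back to murmur3_32_abs. A minimal round-trip sketch is below, assuming a plain Jackson ObjectMapper is enough for this class (the test itself uses the preconfigured MAPPER).

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.ImmutableList;
import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;

public class HashedPartitionsSpecRoundTrip
{
  public static void main(String[] args) throws Exception
  {
    final ObjectMapper mapper = new ObjectMapper();
    final HashedPartitionsSpec spec = new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1"));

    // Serialize, then read back through the polymorphic PartitionsSpec base type.
    final String json = mapper.writeValueAsString(spec);
    final PartitionsSpec roundTripped = mapper.readValue(json, PartitionsSpec.class);

    System.out.println(json);                       // includes "type":"hashed" and the default partitionFunction
    System.out.println(spec.equals(roundTripped));  // expected: true
  }
}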
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class DataSegmentTest, method testV1Serialization.
@Test
public void testV1Serialization() throws Exception
{
  final Interval interval = Intervals.of("2011-10-01/2011-10-02");
  final ImmutableMap<String, Object> loadSpec = ImmutableMap.of("something", "or_other");
  DataSegment segment = new DataSegment(
      "something",
      interval,
      "1",
      loadSpec,
      Arrays.asList("dim1", "dim2"),
      Arrays.asList("met1", "met2"),
      new NumberedShardSpec(3, 0),
      new CompactionState(
          new HashedPartitionsSpec(100000, null, ImmutableList.of("dim1")),
          new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("dim1", "bar", "foo"))),
          ImmutableList.of(ImmutableMap.of("type", "count", "name", "count")),
          ImmutableMap.of("filter", ImmutableMap.of("type", "selector", "dimension", "dim1", "value", "foo")),
          ImmutableMap.of(),
          ImmutableMap.of()
      ),
      TEST_VERSION,
      1
  );
  final Map<String, Object> objectMap = MAPPER.readValue(MAPPER.writeValueAsString(segment), JacksonUtils.TYPE_REFERENCE_MAP_STRING_OBJECT);
  Assert.assertEquals(11, objectMap.size());
  Assert.assertEquals("something", objectMap.get("dataSource"));
  Assert.assertEquals(interval.toString(), objectMap.get("interval"));
  Assert.assertEquals("1", objectMap.get("version"));
  Assert.assertEquals(loadSpec, objectMap.get("loadSpec"));
  Assert.assertEquals("dim1,dim2", objectMap.get("dimensions"));
  Assert.assertEquals("met1,met2", objectMap.get("metrics"));
  Assert.assertEquals(ImmutableMap.of("type", "numbered", "partitionNum", 3, "partitions", 0), objectMap.get("shardSpec"));
  Assert.assertEquals(TEST_VERSION, objectMap.get("binaryVersion"));
  Assert.assertEquals(1, objectMap.get("size"));
  Assert.assertEquals(6, ((Map) objectMap.get("lastCompactionState")).size());
  DataSegment deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
  Assert.assertEquals(segment.getDataSource(), deserializedSegment.getDataSource());
  Assert.assertEquals(segment.getInterval(), deserializedSegment.getInterval());
  Assert.assertEquals(segment.getVersion(), deserializedSegment.getVersion());
  Assert.assertEquals(segment.getLoadSpec(), deserializedSegment.getLoadSpec());
  Assert.assertEquals(segment.getDimensions(), deserializedSegment.getDimensions());
  Assert.assertEquals(segment.getMetrics(), deserializedSegment.getMetrics());
  Assert.assertEquals(segment.getShardSpec(), deserializedSegment.getShardSpec());
  Assert.assertEquals(segment.getSize(), deserializedSegment.getSize());
  Assert.assertEquals(segment.getId(), deserializedSegment.getId());
  Assert.assertEquals(segment.getLastCompactionState(), deserializedSegment.getLastCompactionState());
  deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
  Assert.assertEquals(0, segment.compareTo(deserializedSegment));
  deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
  Assert.assertEquals(0, deserializedSegment.compareTo(segment));
  deserializedSegment = MAPPER.readValue(MAPPER.writeValueAsString(segment), DataSegment.class);
  Assert.assertEquals(segment.hashCode(), deserializedSegment.hashCode());
}
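Both DataSegmentTest methods above rely on a MAPPER configured elsewhere in the test class. Deserializing a DataSegment needs an injectable PruneSpecsHolder; the sketch below shows the kind of setup typically required, assuming Druid's DefaultObjectMapper and the DataSegment.PruneSpecsHolder.DEFAULT constant are available in this version.

import com.fasterxml.jackson.databind.InjectableValues;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.jackson.DefaultObjectMapper;
import org.apache.druid.timeline.DataSegment;

public class SegmentMapperSetup
{
  // DataSegment declares a @JacksonInject PruneSpecsHolder, so the mapper used
  // for readValue(..., DataSegment.class) must have one registered.
  static ObjectMapper makeSegmentMapper()
  {
    final ObjectMapper mapper = new DefaultObjectMapper();
    mapper.setInjectableValues(
        new InjectableValues.Std()
            .addValue(DataSegment.PruneSpecsHolder.class, DataSegment.PruneSpecsHolder.DEFAULT)
    );
    return mapper;
  }
}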
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class IndexTask, method createShardSpecsFromInput.
private PartitionAnalysis createShardSpecsFromInput(
    ObjectMapper jsonMapper,
    IndexIngestionSpec ingestionSchema,
    InputSource inputSource,
    File tmpDir,
    GranularitySpec granularitySpec,
    @Nonnull PartitionsSpec partitionsSpec,
    boolean determineIntervals
) throws IOException
{
  assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
  long determineShardSpecsStartMillis = System.currentTimeMillis();
  final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(
      jsonMapper,
      ingestionSchema,
      inputSource,
      tmpDir,
      granularitySpec,
      partitionsSpec,
      determineIntervals
  );
  final PartitionAnalysis<Integer, ?> partitionAnalysis;
  if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
    partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
  } else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
    partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
  } else {
    throw new UOE("%s", partitionsSpec.getClass().getName());
  }
  for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
    final Interval interval = entry.getKey();
    final int numBucketsPerInterval;
    if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
      final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
      final HyperLogLogCollector collector = entry.getValue().orNull();
      if (partitionsSpec.needsDeterminePartitions(false)) {
        final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
        final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null
                                             ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT
                                             : partitionsSpec.getMaxRowsPerSegment();
        numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
        log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numBucketsPerInterval);
      } else {
        numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
        log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
      }
    } else {
      numBucketsPerInterval = 1;
    }
    partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
  }
  log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
  return partitionAnalysis;
}
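For the hash case with needsDeterminePartitions, the bucket count per interval is simply the HLL row estimate divided by maxRowsPerSegment, rounded up. A small self-contained illustration of that arithmetic follows; the numbers are made up, not taken from the source.

public class BucketCountExample
{
  // Mirrors the computation above: ceil(estimatedRows / maxRowsPerSegment).
  static int numBucketsPerInterval(long estimatedRows, int maxRowsPerSegment)
  {
    return (int) Math.ceil((double) estimatedRows / maxRowsPerSegment);
  }

  public static void main(String[] args)
  {
    // e.g. ~7.2M estimated rows at 5M rows per segment -> 2 hash buckets
    System.out.println(numBucketsPerInterval(7_200_000L, 5_000_000));
  }
}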
Use of org.apache.druid.indexer.partitions.HashedPartitionsSpec in project druid by druid-io.
From the class PartialDimensionCardinalityTask, method runTask.
@Override
public TaskStatus runTask(TaskToolbox toolbox) throws Exception
{
  DataSchema dataSchema = ingestionSchema.getDataSchema();
  GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  HashedPartitionsSpec partitionsSpec = (HashedPartitionsSpec) tuningConfig.getPartitionsSpec();
  Preconditions.checkNotNull(partitionsSpec, "partitionsSpec required in tuningConfig");
  InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
  InputFormat inputFormat = inputSource.needsFormat()
                            ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema)
                            : null;
  final RowIngestionMeters buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
  final ParseExceptionHandler parseExceptionHandler = new ParseExceptionHandler(
      buildSegmentsMeters,
      tuningConfig.isLogParseExceptions(),
      tuningConfig.getMaxParseExceptions(),
      tuningConfig.getMaxSavedParseExceptions()
  );
  final boolean determineIntervals = granularitySpec.inputIntervals().isEmpty();
  try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
      toolbox.getIndexingTmpDir(),
      dataSchema,
      inputSource,
      inputFormat,
      determineIntervals ? Objects::nonNull : AbstractBatchIndexTask.defaultRowFilter(granularitySpec),
      buildSegmentsMeters,
      parseExceptionHandler
  )) {
    Map<Interval, byte[]> cardinalities = determineCardinalities(inputRowIterator, granularitySpec);
    sendReport(toolbox, new DimensionCardinalityReport(getId(), cardinalities));
  }
  return TaskStatus.success(getId());
}
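runTask casts the tuning config's PartitionsSpec directly to HashedPartitionsSpec, since dimension-cardinality collection only makes sense for hash-based secondary partitioning. A defensive variant of that cast is sketched below; the helper class is hypothetical, not Druid API.

import org.apache.druid.indexer.partitions.HashedPartitionsSpec;
import org.apache.druid.indexer.partitions.PartitionsSpec;

final class PartitionsSpecChecks
{
  private PartitionsSpecChecks() {}

  // Fail fast with a readable message instead of a ClassCastException.
  static HashedPartitionsSpec requireHashed(PartitionsSpec partitionsSpec)
  {
    if (!(partitionsSpec instanceof HashedPartitionsSpec)) {
      throw new IllegalStateException(
          "Expected a hashed partitionsSpec but got: "
          + (partitionsSpec == null ? "null" : partitionsSpec.getClass().getName())
      );
    }
    return (HashedPartitionsSpec) partitionsSpec;
  }
}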