Use of org.apache.druid.indexer.partitions.DynamicPartitionsSpec in project druid by druid-io.
The class AbstractGcsInputSourceParallelIndexTest, method doTest:
void doTest(Pair<String, List> gcsInputSource, Pair<Boolean, Boolean> segmentAvailabilityConfirmationPair) throws Exception {
  final String indexDatasource = "wikipedia_index_test_" + UUID.randomUUID();
  try (final Closeable ignored1 = unloader(indexDatasource + config.getExtraDatasourceNameSuffix())) {
    // Fill in the ingestion spec template: a Google Cloud Storage input source, JSON input format,
    // and a dynamic partitions spec with default limits.
    final Function<String, String> gcsPropsTransform = spec -> {
      try {
        String inputSourceValue = jsonMapper.writeValueAsString(gcsInputSource.rhs);
        inputSourceValue = StringUtils.replace(inputSourceValue, "%%BUCKET%%", config.getCloudBucket());
        inputSourceValue = StringUtils.replace(inputSourceValue, "%%PATH%%", config.getCloudPath());
        spec = StringUtils.replace(spec, "%%INPUT_FORMAT_TYPE%%", InputFormatDetails.JSON.getInputFormatType());
        spec = StringUtils.replace(spec, "%%PARTITIONS_SPEC%%", jsonMapper.writeValueAsString(new DynamicPartitionsSpec(null, null)));
        spec = StringUtils.replace(spec, "%%INPUT_SOURCE_TYPE%%", "google");
        spec = StringUtils.replace(spec, "%%INPUT_SOURCE_PROPERTY_KEY%%", gcsInputSource.lhs);
        return StringUtils.replace(spec, "%%INPUT_SOURCE_PROPERTY_VALUE%%", inputSourceValue);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    };
    doIndexTest(indexDatasource, INDEX_TASK, gcsPropsTransform, INDEX_QUERIES_RESOURCE, false, true, true, segmentAvailabilityConfirmationPair);
  }
}
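For reference, a minimal hedged sketch (not part of the test above) of what the %%PARTITIONS_SPEC%% placeholder is replaced with: serializing new DynamicPartitionsSpec(null, null) with a Jackson ObjectMapper should yield a JSON object of type "dynamic", with both row limits left to Druid's built-in defaults. The class name below is illustrative, and the exact default values depend on the Druid version.

import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;

public class PartitionsSpecJsonDemo {
  public static void main(String[] args) throws Exception {
    ObjectMapper mapper = new ObjectMapper();
    // Passing nulls lets DynamicPartitionsSpec fall back to its built-in defaults.
    DynamicPartitionsSpec partitionsSpec = new DynamicPartitionsSpec(null, null);
    // Prints a JSON object with "type": "dynamic", matching what the test injects into the template.
    System.out.println(mapper.writeValueAsString(partitionsSpec));
  }
}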
Use of org.apache.druid.indexer.partitions.DynamicPartitionsSpec in project druid by druid-io.
The class ITOverwriteBatchIndexTest, method submitIngestionTaskAndVerify:
private void submitIngestionTaskAndVerify(String indexDatasource, String fileFilter, boolean dropExisting) throws Exception {
  Map<String, Object> inputFormatMap = new ImmutableMap.Builder<String, Object>()
      .put("type", INPUT_FORMAT_DETAILS.getInputFormatType())
      .build();
  final Function<String, String> sqlInputSourcePropsTransform = spec -> {
    try {
      // Fill in the ingestion spec template: dynamic partitioning, a file-filtered input source
      // under the batch_index resource directory, and the append/drop flags under test.
      spec = StringUtils.replace(spec, "%%PARTITIONS_SPEC%%", jsonMapper.writeValueAsString(new DynamicPartitionsSpec(null, null)));
      spec = StringUtils.replace(spec, "%%INPUT_SOURCE_FILTER%%", fileFilter);
      spec = StringUtils.replace(spec, "%%INPUT_SOURCE_BASE_DIR%%", "/resources/data/batch_index" + INPUT_FORMAT_DETAILS.getFolderSuffix());
      spec = StringUtils.replace(spec, "%%INPUT_FORMAT%%", jsonMapper.writeValueAsString(inputFormatMap));
      spec = StringUtils.replace(spec, "%%APPEND_TO_EXISTING%%", jsonMapper.writeValueAsString(false));
      spec = StringUtils.replace(spec, "%%DROP_EXISTING%%", jsonMapper.writeValueAsString(dropExisting));
      spec = StringUtils.replace(spec, "%%FORCE_GUARANTEED_ROLLUP%%", jsonMapper.writeValueAsString(false));
      return spec;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  };
  doIndexTest(indexDatasource, INDEX_TASK, sqlInputSourcePropsTransform, null, false, false, true, new Pair<>(false, false));
}
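A test driver typically calls this helper more than once so that dropExisting is actually exercised. A hedged usage sketch follows; the datasource name and file filters are illustrative, not copied from the test:

// Illustrative only: ingest a broad file set first, then re-ingest a narrower set with
// dropExisting=true so existing segments in the re-ingested intervals that are not
// covered by the new data get dropped rather than left in place.
String indexDatasource = "wikipedia_index_test_" + UUID.randomUUID();
submitIngestionTaskAndVerify(indexDatasource, "wikipedia_index_data*", false);
submitIngestionTaskAndVerify(indexDatasource, "wikipedia_index_data3*", true);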
Use of org.apache.druid.indexer.partitions.DynamicPartitionsSpec in project druid by druid-io.
The class DataSegmentTest, method testWithLastCompactionState:
@Test
public void testWithLastCompactionState() {
  final CompactionState compactionState = new CompactionState(
      new DynamicPartitionsSpec(null, null),
      new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("bar", "foo"))),
      ImmutableList.of(ImmutableMap.of("type", "count", "name", "count")),
      ImmutableMap.of("filter", ImmutableMap.of("type", "selector", "dimension", "dim1", "value", "foo")),
      Collections.singletonMap("test", "map"),
      Collections.singletonMap("test2", "map2")
  );
  final DataSegment segment1 = DataSegment.builder()
      .dataSource("foo")
      .interval(Intervals.of("2012-01-01/2012-01-02"))
      .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString())
      .shardSpec(getShardSpec(7))
      .size(0)
      .lastCompactionState(compactionState)
      .build();
  final DataSegment segment2 = DataSegment.builder()
      .dataSource("foo")
      .interval(Intervals.of("2012-01-01/2012-01-02"))
      .version(DateTimes.of("2012-01-01T11:22:33.444Z").toString())
      .shardSpec(getShardSpec(7))
      .size(0)
      .build();
  Assert.assertEquals(segment1, segment2.withLastCompactionState(compactionState));
}
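The compaction state attached this way travels with the segment metadata, so the partitioning decision can later be read back off the segment. A brief hedged sketch, assuming CompactionState exposes a getPartitionsSpec() accessor as in recent Druid versions:

// Hedged sketch: read the recorded partitioning decision back from the segment built above.
PartitionsSpec recorded = segment1.getLastCompactionState().getPartitionsSpec();
Assert.assertTrue(recorded instanceof DynamicPartitionsSpec);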
Use of org.apache.druid.indexer.partitions.DynamicPartitionsSpec in project druid by druid-io.
The class IndexTask, method generateAndPublishSegments:
/**
* This method reads input data row by row and adds each row to the appropriate segment using {@link BaseAppenderatorDriver}.
* If there is no segment for the row, a new one is created. Segments can be published in the middle of reading inputs
* if {@link DynamicPartitionsSpec} is used and one of the conditions below is satisfied.
*
* <ul>
* <li>
* If the number of rows in a segment exceeds {@link DynamicPartitionsSpec#maxRowsPerSegment}
* </li>
* <li>
* If the number of rows added to {@link BaseAppenderatorDriver} so far exceeds {@link DynamicPartitionsSpec#maxTotalRows}
* </li>
* </ul>
* <p>
* At the end of this method, all the remaining segments are published.
*
* @return the last {@link TaskStatus}
*/
private TaskStatus generateAndPublishSegments(
    final TaskToolbox toolbox,
    final DataSchema dataSchema,
    final InputSource inputSource,
    final File tmpDir,
    final PartitionAnalysis partitionAnalysis
) throws IOException, InterruptedException {
  final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
  FireDepartmentMetrics buildSegmentsFireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
  if (toolbox.getMonitorScheduler() != null) {
    final TaskRealtimeMetricsMonitor metricsMonitor = TaskRealtimeMetricsMonitorBuilder.build(this, fireDepartmentForMetrics, buildSegmentsMeters);
    toolbox.getMonitorScheduler().addMonitor(metricsMonitor);
  }
  final PartitionsSpec partitionsSpec = partitionAnalysis.getPartitionsSpec();
  final IndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  final long pushTimeout = tuningConfig.getPushTimeout();
  final SegmentAllocatorForBatch segmentAllocator;
  final SequenceNameFunction sequenceNameFunction;
  switch (partitionsSpec.getType()) {
    case HASH:
    case RANGE:
      final SegmentAllocatorForBatch localSegmentAllocator = SegmentAllocators.forNonLinearPartitioning(
          toolbox,
          getDataSource(),
          baseSequenceName,
          dataSchema.getGranularitySpec(),
          null,
          (CompletePartitionAnalysis) partitionAnalysis
      );
      sequenceNameFunction = localSegmentAllocator.getSequenceNameFunction();
      segmentAllocator = localSegmentAllocator;
      break;
    case LINEAR:
      segmentAllocator = SegmentAllocators.forLinearPartitioning(
          toolbox,
          baseSequenceName,
          null,
          dataSchema,
          getTaskLockHelper(),
          ingestionSchema.getIOConfig().isAppendToExisting(),
          partitionAnalysis.getPartitionsSpec(),
          null
      );
      sequenceNameFunction = segmentAllocator.getSequenceNameFunction();
      break;
    default:
      throw new UOE("[%s] secondary partition type is not supported", partitionsSpec.getType());
  }
  Set<DataSegment> segmentsFoundForDrop = null;
  if (ingestionSchema.getIOConfig().isDropExisting()) {
    segmentsFoundForDrop = getUsedSegmentsWithinInterval(toolbox, getDataSource(), ingestionSchema.getDataSchema().getGranularitySpec().inputIntervals());
  }
  final TransactionalSegmentPublisher publisher =
      (segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish, commitMetadata) ->
          toolbox.getTaskActionClient().submit(
              SegmentTransactionalInsertAction.overwriteAction(segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish)
          );
  String effectiveId = getContextValue(CompactionTask.CTX_KEY_APPENDERATOR_TRACKING_TASK_ID, null);
  if (effectiveId == null) {
    effectiveId = getId();
  }
  final Appenderator appenderator = BatchAppenderators.newAppenderator(
      effectiveId,
      toolbox.getAppenderatorsManager(),
      buildSegmentsFireDepartmentMetrics,
      toolbox,
      dataSchema,
      tuningConfig,
      buildSegmentsMeters,
      buildSegmentsParseExceptionHandler,
      isUseMaxMemoryEstimates()
  );
  boolean exceptionOccurred = false;
  try (final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator)) {
    driver.startJob();
    InputSourceProcessor.process(
        dataSchema,
        driver,
        partitionsSpec,
        inputSource,
        inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null,
        tmpDir,
        sequenceNameFunction,
        new DefaultIndexTaskInputRowIteratorBuilder(),
        buildSegmentsMeters,
        buildSegmentsParseExceptionHandler,
        pushTimeout
    );
    // If we use timeChunk lock, then we don't have to specify what segments will be overwritten because
    // it will just overwrite all segments overlapped with the new segments.
    final Set<DataSegment> inputSegments = getTaskLockHelper().isUseSegmentLock() ? getTaskLockHelper().getLockedExistingSegments() : null;
    final boolean storeCompactionState = getContextValue(Tasks.STORE_COMPACTION_STATE_KEY, Tasks.DEFAULT_STORE_COMPACTION_STATE);
    final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction = compactionStateAnnotateFunction(storeCompactionState, toolbox, ingestionSchema);
    // Probably we can publish atomicUpdateGroup along with segments.
    final SegmentsAndCommitMetadata published = awaitPublish(driver.publishAll(inputSegments, segmentsFoundForDrop, publisher, annotateFunction), pushTimeout);
    appenderator.close();
    // Wait for segment availability if awaitSegmentAvailabilityTimeoutMillis is set.
    if (tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis() > 0 && published != null) {
      ingestionState = IngestionState.SEGMENT_AVAILABILITY_WAIT;
      ArrayList<DataSegment> segmentsToWaitFor = new ArrayList<>(published.getSegments());
      waitForSegmentAvailability(toolbox, segmentsToWaitFor, tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis());
    }
    ingestionState = IngestionState.COMPLETED;
    if (published == null) {
      log.error("Failed to publish segments, aborting!");
      errorMsg = "Failed to publish segments.";
      toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
      return TaskStatus.failure(getId(), errorMsg);
    } else {
      log.info("Processed[%,d] events, unparseable[%,d], thrownAway[%,d].", buildSegmentsMeters.getProcessed(), buildSegmentsMeters.getUnparseable(), buildSegmentsMeters.getThrownAway());
      log.info("Published [%s] segments", published.getSegments().size());
      log.debugSegments(published.getSegments(), "Published segments");
      toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
      return TaskStatus.success(getId());
    }
  } catch (TimeoutException | ExecutionException e) {
    exceptionOccurred = true;
    throw new RuntimeException(e);
  } catch (Exception e) {
    exceptionOccurred = true;
    throw e;
  } finally {
    if (exceptionOccurred) {
      appenderator.closeNow();
    } else {
      appenderator.close();
    }
  }
}
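A hedged sketch of the incremental-publish conditions described in the javadoc above. This is not the actual Druid code; the helper name is illustrative, and the fallback limits are assumptions matching the defaults documented for the dynamic partitionsSpec at the time of writing:

// Illustrative helper, not part of IndexTask: with dynamic partitioning, segments can be
// pushed and published mid-ingestion once either row threshold is reached.
static boolean shouldPublishIncrementally(DynamicPartitionsSpec spec, long rowsInCurrentSegment, long totalRowsInDriver) {
  // Fallback values are assumptions based on Druid's documented defaults (5M rows per segment, 20M total rows).
  long maxRowsPerSegment = spec.getMaxRowsPerSegment() != null ? spec.getMaxRowsPerSegment() : 5_000_000L;
  long maxTotalRows = spec.getMaxTotalRows() != null ? spec.getMaxTotalRows() : 20_000_000L;
  return rowsInCurrentSegment >= maxRowsPerSegment || totalRowsInDriver >= maxTotalRows;
}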
Use of org.apache.druid.indexer.partitions.DynamicPartitionsSpec in project druid by druid-io.
The class IndexTask, method createShardSpecsFromInput:
private PartitionAnalysis createShardSpecsFromInput(
    ObjectMapper jsonMapper,
    IndexIngestionSpec ingestionSchema,
    InputSource inputSource,
    File tmpDir,
    GranularitySpec granularitySpec,
    @Nonnull PartitionsSpec partitionsSpec,
    boolean determineIntervals
) throws IOException {
  assert partitionsSpec.getType() != SecondaryPartitionType.RANGE;
  long determineShardSpecsStartMillis = System.currentTimeMillis();
  final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = collectIntervalsAndShardSpecs(jsonMapper, ingestionSchema, inputSource, tmpDir, granularitySpec, partitionsSpec, determineIntervals);
  final PartitionAnalysis<Integer, ?> partitionAnalysis;
  if (partitionsSpec.getType() == SecondaryPartitionType.LINEAR) {
    partitionAnalysis = new LinearPartitionAnalysis((DynamicPartitionsSpec) partitionsSpec);
  } else if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
    partitionAnalysis = new HashPartitionAnalysis((HashedPartitionsSpec) partitionsSpec);
  } else {
    throw new UOE("%s", partitionsSpec.getClass().getName());
  }
  for (final Map.Entry<Interval, Optional<HyperLogLogCollector>> entry : hllCollectors.entrySet()) {
    final Interval interval = entry.getKey();
    final int numBucketsPerInterval;
    if (partitionsSpec.getType() == SecondaryPartitionType.HASH) {
      final HashedPartitionsSpec hashedPartitionsSpec = (HashedPartitionsSpec) partitionsSpec;
      final HyperLogLogCollector collector = entry.getValue().orNull();
      if (partitionsSpec.needsDeterminePartitions(false)) {
        final long numRows = Preconditions.checkNotNull(collector, "HLL collector").estimateCardinalityRound();
        final int nonNullMaxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment() == null ? PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT : partitionsSpec.getMaxRowsPerSegment();
        numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
        log.info("Estimated [%,d] rows of data for interval [%s], creating [%,d] shards", numRows, interval, numBucketsPerInterval);
      } else {
        numBucketsPerInterval = hashedPartitionsSpec.getNumShards() == null ? 1 : hashedPartitionsSpec.getNumShards();
        log.info("Creating [%,d] buckets for interval [%s]", numBucketsPerInterval, interval);
      }
    } else {
      // Dynamic (LINEAR) partitioning allocates segments incrementally during ingestion,
      // so only a single initial bucket per interval is recorded here.
      numBucketsPerInterval = 1;
    }
    partitionAnalysis.updateBucket(interval, numBucketsPerInterval);
  }
  log.info("Found intervals and shardSpecs in %,dms", System.currentTimeMillis() - determineShardSpecsStartMillis);
  return partitionAnalysis;
}
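For the hash branch above, the bucket count is a simple ceiling division of the estimated row count by the per-segment limit. A worked example with illustrative numbers (the 5M default matches PartitionsSpec.DEFAULT_MAX_ROWS_PER_SEGMENT in recent Druid versions):

// Illustrative numbers only: 12M estimated rows with a 5M-row limit per segment.
long numRows = 12_000_000L;                // HLL-estimated row count for the interval
int nonNullMaxRowsPerSegment = 5_000_000;  // assumed default when the spec leaves it null
int numBucketsPerInterval = (int) Math.ceil((double) numRows / nonNullMaxRowsPerSegment);
// numBucketsPerInterval == 3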