Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.
Class RangePartitionIndexTaskInputRowIteratorBuilderTest, method throwsExceptionIfMultipleDimensionValues.
@Test
public void throwsExceptionIfMultipleDimensionValues()
{
  DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP;
  List<String> multipleDimensionValues = Arrays.asList("multiple", "dimension", "values");
  InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, multipleDimensionValues);
  CloseableIterator<InputRow> inputRowIterator =
      IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow);
  GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(
      timestamp,
      IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT
  );

  exception.expect(IllegalArgumentException.class);
  exception.expectMessage("Cannot partition on multi-value dimension [dimension]");

  HANDLER_TESTER.invokeHandlers(inputRowIterator, granularitySpec, NO_NEXT_INPUT_ROW);
}
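For context, the testing factory builds an input row whose single dimension carries a list of values. A minimal sketch of how such a multi-value row could be constructed directly with MapBasedInputRow follows; the dimension name "dimension" and this construction are assumptions for illustration, not the factory's actual code.

// Sketch only (requires org.apache.druid.data.input.MapBasedInputRow); not the factory's code.
static InputRow multiValueRowSketch(DateTime timestamp)
{
  // Giving a dimension a List value makes it multi-valued, which range partitioning rejects.
  return new MapBasedInputRow(
      timestamp,
      Collections.singletonList("dimension"),
      Collections.singletonMap("dimension", Arrays.asList("multiple", "dimension", "values"))
  );
}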
Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.
Class RangePartitionIndexTaskInputRowIteratorBuilderTest, method invokesDimensionValueCountFilterLast.
@Test
public void invokesDimensionValueCountFilterLast()
{
  DateTime timestamp = IndexTaskInputRowIteratorBuilderTestingFactory.TIMESTAMP;
  // Rows.objectToStrings() returns an empty list for null
  List<String> nullDimensionValue = Collections.emptyList();
  InputRow inputRow = IndexTaskInputRowIteratorBuilderTestingFactory.createInputRow(timestamp, nullDimensionValue);
  CloseableIterator<InputRow> inputRowIterator =
      IndexTaskInputRowIteratorBuilderTestingFactory.createInputRowIterator(inputRow);
  GranularitySpec granularitySpec = IndexTaskInputRowIteratorBuilderTestingFactory.createGranularitySpec(
      timestamp,
      IndexTaskInputRowIteratorBuilderTestingFactory.PRESENT_BUCKET_INTERVAL_OPT
  );

  List<IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler> handlerInvocationHistory =
      HANDLER_TESTER.invokeHandlers(inputRowIterator, granularitySpec, NO_NEXT_INPUT_ROW);

  assertNotInHandlerInvocationHistory(handlerInvocationHistory, IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.NULL_ROW);
  assertNotInHandlerInvocationHistory(handlerInvocationHistory, IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler.ABSENT_BUCKET_INTERVAL);
}
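The assertNotInHandlerInvocationHistory helper used above belongs to the test class and is not shown here. A plausible shape for it, assuming JUnit's Assert, would be the following sketch; the actual implementation may differ.

// Sketch of the assertion helper; not the actual test code.
private static void assertNotInHandlerInvocationHistory(
    List<IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler> history,
    IndexTaskInputRowIteratorBuilderTestingFactory.HandlerTester.Handler handler
)
{
  Assert.assertFalse("Handler should not have been invoked: " + handler, history.contains(handler));
}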
Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.
Class SinglePhaseSubTask, method generateAndPushSegments.
/**
 * This method reads input data row by row and adds each row to the proper segment using {@link BaseAppenderatorDriver}.
 * If there is no segment for a row, a new one is created. Segments can be published in the middle of reading input
 * if one of the conditions below is satisfied:
 *
 * <ul>
 * <li>
 * The number of rows in a segment exceeds {@link DynamicPartitionsSpec#maxRowsPerSegment}
 * </li>
 * <li>
 * The number of rows added to {@link BaseAppenderatorDriver} so far exceeds {@link DynamicPartitionsSpec#maxTotalRows}
 * </li>
 * </ul>
 * <p>
 * At the end of this method, all remaining segments are published.
 *
 * @return the set of segments generated and pushed by this task
 */
private Set<DataSegment> generateAndPushSegments(
    final TaskToolbox toolbox,
    final ParallelIndexSupervisorTaskClient taskClient,
    final InputSource inputSource,
    final File tmpDir
) throws IOException, InterruptedException
{
  final DataSchema dataSchema = ingestionSchema.getDataSchema();
  final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
  final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
  final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
  toolbox.addMonitor(
      new RealtimeMetricsMonitor(
          Collections.singletonList(fireDepartmentForMetrics),
          Collections.singletonMap(DruidMetrics.TASK_ID, new String[]{getId()})
      )
  );

  final ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  final DynamicPartitionsSpec partitionsSpec = (DynamicPartitionsSpec) tuningConfig.getGivenOrDefaultPartitionsSpec();
  final long pushTimeout = tuningConfig.getPushTimeout();
  final boolean explicitIntervals = !granularitySpec.inputIntervals().isEmpty();
  final boolean useLineageBasedSegmentAllocation = getContextValue(
      SinglePhaseParallelIndexTaskRunner.CTX_USE_LINEAGE_BASED_SEGMENT_ALLOCATION_KEY,
      SinglePhaseParallelIndexTaskRunner.LEGACY_DEFAULT_USE_LINEAGE_BASED_SEGMENT_ALLOCATION
  );
  // subtaskSpecId is used as the sequenceName, so that retry tasks for the same spec
  // can allocate the same set of segments.
  final String sequenceName = useLineageBasedSegmentAllocation
                              ? Preconditions.checkNotNull(subtaskSpecId, "subtaskSpecId")
                              : getId();
  final SegmentAllocatorForBatch segmentAllocator = SegmentAllocators.forLinearPartitioning(
      toolbox,
      sequenceName,
      new SupervisorTaskAccess(getSupervisorTaskId(), taskClient),
      getIngestionSchema().getDataSchema(),
      getTaskLockHelper(),
      ingestionSchema.getIOConfig().isAppendToExisting(),
      partitionsSpec,
      useLineageBasedSegmentAllocation
  );
  final boolean useMaxMemoryEstimates = getContextValue(Tasks.USE_MAX_MEMORY_ESTIMATES, Tasks.DEFAULT_USE_MAX_MEMORY_ESTIMATES);
  final Appenderator appenderator = BatchAppenderators.newAppenderator(
      getId(),
      toolbox.getAppenderatorsManager(),
      fireDepartmentMetrics,
      toolbox,
      dataSchema,
      tuningConfig,
      rowIngestionMeters,
      parseExceptionHandler,
      useMaxMemoryEstimates
  );
  boolean exceptionOccurred = false;
  try (
      final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator);
      final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(
          tmpDir,
          dataSchema,
          inputSource,
          inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null,
          inputRow -> {
            if (inputRow == null) {
              return false;
            }
            if (explicitIntervals) {
              final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
              return optInterval.isPresent();
            }
            return true;
          },
          rowIngestionMeters,
          parseExceptionHandler
      )
  ) {
    driver.startJob();
    final Set<DataSegment> pushedSegments = new HashSet<>();

    while (inputRowIterator.hasNext()) {
      final InputRow inputRow = inputRowIterator.next();

      // Segments are created as needed, using a single sequence name. They may be allocated from the overlord
      // (in append mode) or may be created on our own authority (in overwrite mode).
      final AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName);

      if (addResult.isOk()) {
        final boolean isPushRequired = addResult.isPushRequired(
            partitionsSpec.getMaxRowsPerSegment(),
            partitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS)
        );
        if (isPushRequired) {
          // There can be segments waiting to be published even though no more rows will be added to them.
          // If those segments are not published here, the space available in the appenderator stays small,
          // which in turn makes the resulting segments smaller.
          final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
          pushedSegments.addAll(pushed.getSegments());
          LOG.info("Pushed [%s] segments", pushed.getSegments().size());
          LOG.infoSegments(pushed.getSegments(), "Pushed segments");
        }
      } else {
        throw new ISE("Failed to add a row with timestamp[%s]", inputRow.getTimestamp());
      }

      fireDepartmentMetrics.incrementProcessed();
    }

    final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
    pushedSegments.addAll(pushed.getSegments());
    LOG.info("Pushed [%s] segments", pushed.getSegments().size());
    LOG.infoSegments(pushed.getSegments(), "Pushed segments");
    appenderator.close();
    return pushedSegments;
  }
  catch (TimeoutException | ExecutionException e) {
    exceptionOccurred = true;
    throw new RuntimeException(e);
  }
  catch (Exception e) {
    exceptionOccurred = true;
    throw e;
  }
  finally {
    if (exceptionOccurred) {
      appenderator.closeNow();
    } else {
      appenderator.close();
    }
  }
}
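The two publish conditions listed in the Javadoc correspond to the addResult.isPushRequired(...) call inside the loop above. As a rough illustration, that decision reduces to the following standalone sketch; the counter parameters are assumptions, since the real bookkeeping lives inside AppenderatorDriverAddResult.

// Sketch of the dynamic-partitioning push decision; not Druid code.
// The limits mirror DynamicPartitionsSpec#maxRowsPerSegment and DynamicPartitionsSpec#maxTotalRows.
static boolean isPushRequiredSketch(long rowsInCurrentSegment, long totalRowsAdded, Integer maxRowsPerSegment, Long maxTotalRows)
{
  // Condition 1: the open segment has reached its per-segment row limit.
  boolean segmentFull = maxRowsPerSegment != null && rowsInCurrentSegment >= maxRowsPerSegment;
  // Condition 2: the rows buffered in the driver have reached the total limit.
  boolean totalFull = maxTotalRows != null && totalRowsAdded >= maxTotalRows;
  return segmentFull || totalFull;
}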
Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.
Class ParallelIndexTestingFactory, method createDataSchema.
static DataSchema createDataSchema(List<Interval> granularitySpecInputIntervals)
{
  GranularitySpec granularitySpec = new ArbitraryGranularitySpec(Granularities.DAY, granularitySpecInputIntervals);
  TimestampSpec timestampSpec = new TimestampSpec(SCHEMA_TIME, "auto", null);
  DimensionsSpec dimensionsSpec = new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of(SCHEMA_DIMENSION)));
  return new DataSchema(
      DATASOURCE, timestampSpec, dimensionsSpec, new AggregatorFactory[]{}, granularitySpec, TransformSpec.NONE, null, NESTED_OBJECT_MAPPER
  );
}
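A hypothetical call of this factory method from a test could look like the line below; the interval literal is an assumption chosen only for illustration.

// Hypothetical usage; the interval is an arbitrary example value.
DataSchema dataSchema = ParallelIndexTestingFactory.createDataSchema(
    Collections.singletonList(Intervals.of("2017-12-01/2017-12-02"))
);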
Use of org.apache.druid.segment.indexing.granularity.GranularitySpec in project druid by druid-io.
Class AbstractMultiPhaseParallelIndexingTest, method newTask.
protected ParallelIndexSupervisorTask newTask(
    @Nullable TimestampSpec timestampSpec, @Nullable DimensionsSpec dimensionsSpec, @Nullable InputFormat inputFormat,
    @Nullable ParseSpec parseSpec, Interval interval, File inputDir, String filter, PartitionsSpec partitionsSpec,
    int maxNumConcurrentSubTasks, boolean appendToExisting
)
{
  GranularitySpec granularitySpec = new UniformGranularitySpec(
      SEGMENT_GRANULARITY, Granularities.MINUTE, interval == null ? null : Collections.singletonList(interval)
  );
  ParallelIndexTuningConfig tuningConfig = newTuningConfig(partitionsSpec, maxNumConcurrentSubTasks, !appendToExisting);
  final ParallelIndexIngestionSpec ingestionSpec;
  if (useInputFormatApi) {
    Preconditions.checkArgument(parseSpec == null);
    ParallelIndexIOConfig ioConfig =
        new ParallelIndexIOConfig(null, new LocalInputSource(inputDir, filter), inputFormat, appendToExisting, null);
    ingestionSpec = new ParallelIndexIngestionSpec(
        new DataSchema(DATASOURCE, timestampSpec, dimensionsSpec, new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")}, granularitySpec, null),
        ioConfig,
        tuningConfig
    );
  } else {
    Preconditions.checkArgument(inputFormat == null);
    ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(new LocalFirehoseFactory(inputDir, filter, null), appendToExisting);
    //noinspection unchecked
    ingestionSpec = new ParallelIndexIngestionSpec(
        new DataSchema("dataSource", getObjectMapper().convertValue(new StringInputRowParser(parseSpec, null), Map.class), new AggregatorFactory[]{new LongSumAggregatorFactory("val", "val")}, granularitySpec, null, getObjectMapper()),
        ioConfig,
        tuningConfig
    );
  }
  // set up test tools
  return new ParallelIndexSupervisorTask(null, null, null, ingestionSpec, Collections.emptyMap());
}
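A hypothetical invocation of newTask for a hash-partitioned run over the InputFormat API might look like the sketch below; every argument value (column names, interval, filter, shard count) is an assumption made only for illustration, and inputDir is assumed to be set up elsewhere in the test.

// Hypothetical call; all argument values are example assumptions.
ParallelIndexSupervisorTask task = newTask(
    new TimestampSpec("ts", "auto", null),
    new DimensionsSpec(DimensionsSpec.getDefaultSchemas(Arrays.asList("dim1", "dim2"))),
    new CsvInputFormat(Arrays.asList("ts", "dim1", "dim2", "val"), null, null, false, 0),
    null,                                    // parseSpec must be null when useInputFormatApi is true
    Intervals.of("2017-12-01/2017-12-02"),
    inputDir,
    "test_*",
    new HashedPartitionsSpec(null, 2, null),
    2,
    false
);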