use of org.apache.druid.data.input.InputSource in project druid by druid-io.
the class InputSourceSamplerTest method testWithTransformsAutoDimensions.
@Test
public void testWithTransformsAutoDimensions() throws IOException {
final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(null);
final TransformSpec transformSpec = new TransformSpec(null, ImmutableList.of(new ExpressionTransform("dim1PlusBar", "concat(dim1, 'bar')", TestExprMacroTable.INSTANCE)));
final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, true, null);
final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, transformSpec);
final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
final InputFormat inputFormat = createInputFormat();
SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
Assert.assertEquals(6, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(4, response.getData().size());
List<SamplerResponseRow> data = response.getData();
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 6L).build(), null, null), data.get(0));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo2").put("dim2", null).put("met1", 4L).build(), null, null), data.get(1));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(4), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", "bar").put("met1", 5L).build(), null, null), data.get(2));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(3));
}
use of org.apache.druid.data.input.InputSource in project druid by druid-io.
the class InputSourceSamplerTest method testWithNoRollup.
@Test
public void testWithNoRollup() throws IOException {
final TimestampSpec timestampSpec = new TimestampSpec("t", null, null);
final DimensionsSpec dimensionsSpec = new DimensionsSpec(null);
final AggregatorFactory[] aggregatorFactories = { new LongSumAggregatorFactory("met1", "met1") };
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularities.DAY, Granularities.HOUR, false, null);
final DataSchema dataSchema = createDataSchema(timestampSpec, dimensionsSpec, aggregatorFactories, granularitySpec, null);
final InputSource inputSource = createInputSource(getTestRows(), dataSchema);
final InputFormat inputFormat = createInputFormat();
SamplerResponse response = inputSourceSampler.sample(inputSource, inputFormat, dataSchema, null);
Assert.assertEquals(6, response.getNumRowsRead());
Assert.assertEquals(5, response.getNumRowsIndexed());
Assert.assertEquals(6, response.getData().size());
List<SamplerResponseRow> data = response.getData();
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(0), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 1L).build(), null, null), data.get(0));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(1), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 2L).build(), null, null), data.get(1));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(2), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", null).put("met1", 3L).build(), null, null), data.get(2));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(3), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo2").put("dim2", null).put("met1", 4L).build(), null, null), data.get(3));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(4), new SamplerTestUtils.MapAllowingNullValuesBuilder<String, Object>().put("__time", 1555934400000L).put("dim1", "foo").put("dim2", "bar").put("met1", 5L).build(), null, null), data.get(4));
assertEqualsSamplerResponseRow(new SamplerResponseRow(getRawColumns().get(5), null, true, getUnparseableTimestampString()), data.get(5));
}
use of org.apache.druid.data.input.InputSource in project druid by druid-io.
the class IndexTask method generateAndPublishSegments.
/**
* This method reads input data row by row and adds the read row to a proper segment using {@link BaseAppenderatorDriver}.
* If there is no segment for the row, a new one is created. Segments can be published in the middle of reading inputs
* if {@link DynamicPartitionsSpec} is used and one of below conditions are satisfied.
*
* <ul>
* <li>
* If the number of rows in a segment exceeds {@link DynamicPartitionsSpec#maxRowsPerSegment}
* </li>
* <li>
* If the number of rows added to {@link BaseAppenderatorDriver} so far exceeds {@link DynamicPartitionsSpec#maxTotalRows}
* </li>
* </ul>
* <p>
* At the end of this method, all the remaining segments are published.
*
* @return the last {@link TaskStatus}
*/
private TaskStatus generateAndPublishSegments(final TaskToolbox toolbox, final DataSchema dataSchema, final InputSource inputSource, final File tmpDir, final PartitionAnalysis partitionAnalysis) throws IOException, InterruptedException {
final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
FireDepartmentMetrics buildSegmentsFireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
if (toolbox.getMonitorScheduler() != null) {
final TaskRealtimeMetricsMonitor metricsMonitor = TaskRealtimeMetricsMonitorBuilder.build(this, fireDepartmentForMetrics, buildSegmentsMeters);
toolbox.getMonitorScheduler().addMonitor(metricsMonitor);
}
final PartitionsSpec partitionsSpec = partitionAnalysis.getPartitionsSpec();
final IndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
final long pushTimeout = tuningConfig.getPushTimeout();
final SegmentAllocatorForBatch segmentAllocator;
final SequenceNameFunction sequenceNameFunction;
switch(partitionsSpec.getType()) {
case HASH:
case RANGE:
final SegmentAllocatorForBatch localSegmentAllocator = SegmentAllocators.forNonLinearPartitioning(toolbox, getDataSource(), baseSequenceName, dataSchema.getGranularitySpec(), null, (CompletePartitionAnalysis) partitionAnalysis);
sequenceNameFunction = localSegmentAllocator.getSequenceNameFunction();
segmentAllocator = localSegmentAllocator;
break;
case LINEAR:
segmentAllocator = SegmentAllocators.forLinearPartitioning(toolbox, baseSequenceName, null, dataSchema, getTaskLockHelper(), ingestionSchema.getIOConfig().isAppendToExisting(), partitionAnalysis.getPartitionsSpec(), null);
sequenceNameFunction = segmentAllocator.getSequenceNameFunction();
break;
default:
throw new UOE("[%s] secondary partition type is not supported", partitionsSpec.getType());
}
Set<DataSegment> segmentsFoundForDrop = null;
if (ingestionSchema.getIOConfig().isDropExisting()) {
segmentsFoundForDrop = getUsedSegmentsWithinInterval(toolbox, getDataSource(), ingestionSchema.getDataSchema().getGranularitySpec().inputIntervals());
}
final TransactionalSegmentPublisher publisher = (segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish, commitMetadata) -> toolbox.getTaskActionClient().submit(SegmentTransactionalInsertAction.overwriteAction(segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish));
String effectiveId = getContextValue(CompactionTask.CTX_KEY_APPENDERATOR_TRACKING_TASK_ID, null);
if (effectiveId == null) {
effectiveId = getId();
}
final Appenderator appenderator = BatchAppenderators.newAppenderator(effectiveId, toolbox.getAppenderatorsManager(), buildSegmentsFireDepartmentMetrics, toolbox, dataSchema, tuningConfig, buildSegmentsMeters, buildSegmentsParseExceptionHandler, isUseMaxMemoryEstimates());
boolean exceptionOccurred = false;
try (final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator)) {
driver.startJob();
InputSourceProcessor.process(dataSchema, driver, partitionsSpec, inputSource, inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null, tmpDir, sequenceNameFunction, new DefaultIndexTaskInputRowIteratorBuilder(), buildSegmentsMeters, buildSegmentsParseExceptionHandler, pushTimeout);
// If we use timeChunk lock, then we don't have to specify what segments will be overwritten because
// it will just overwrite all segments overlapped with the new segments.
final Set<DataSegment> inputSegments = getTaskLockHelper().isUseSegmentLock() ? getTaskLockHelper().getLockedExistingSegments() : null;
final boolean storeCompactionState = getContextValue(Tasks.STORE_COMPACTION_STATE_KEY, Tasks.DEFAULT_STORE_COMPACTION_STATE);
final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction = compactionStateAnnotateFunction(storeCompactionState, toolbox, ingestionSchema);
// Probably we can publish atomicUpdateGroup along with segments.
final SegmentsAndCommitMetadata published = awaitPublish(driver.publishAll(inputSegments, segmentsFoundForDrop, publisher, annotateFunction), pushTimeout);
appenderator.close();
// for awaitSegmentAvailabilityTimeoutMillis
if (tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis() > 0 && published != null) {
ingestionState = IngestionState.SEGMENT_AVAILABILITY_WAIT;
ArrayList<DataSegment> segmentsToWaitFor = new ArrayList<>(published.getSegments());
waitForSegmentAvailability(toolbox, segmentsToWaitFor, tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis());
}
ingestionState = IngestionState.COMPLETED;
if (published == null) {
log.error("Failed to publish segments, aborting!");
errorMsg = "Failed to publish segments.";
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
return TaskStatus.failure(getId(), errorMsg);
} else {
log.info("Processed[%,d] events, unparseable[%,d], thrownAway[%,d].", buildSegmentsMeters.getProcessed(), buildSegmentsMeters.getUnparseable(), buildSegmentsMeters.getThrownAway());
log.info("Published [%s] segments", published.getSegments().size());
log.debugSegments(published.getSegments(), "Published segments");
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
return TaskStatus.success(getId());
}
} catch (TimeoutException | ExecutionException e) {
exceptionOccurred = true;
throw new RuntimeException(e);
} catch (Exception e) {
exceptionOccurred = true;
throw e;
} finally {
if (exceptionOccurred) {
appenderator.closeNow();
} else {
appenderator.close();
}
}
}
use of org.apache.druid.data.input.InputSource in project druid by druid-io.
the class IndexTask method collectIntervalsAndShardSpecs.
private Map<Interval, Optional<HyperLogLogCollector>> collectIntervalsAndShardSpecs(ObjectMapper jsonMapper, IndexIngestionSpec ingestionSchema, InputSource inputSource, File tmpDir, GranularitySpec granularitySpec, @Nonnull PartitionsSpec partitionsSpec, boolean determineIntervals) throws IOException {
final Map<Interval, Optional<HyperLogLogCollector>> hllCollectors = new TreeMap<>(Comparators.intervalsByStartThenEnd());
final Granularity queryGranularity = granularitySpec.getQueryGranularity();
final Predicate<InputRow> rowFilter = inputRow -> {
if (inputRow == null) {
return false;
}
if (determineIntervals) {
return true;
}
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
return optInterval.isPresent();
};
try (final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(tmpDir, ingestionSchema.getDataSchema(), inputSource, inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null, rowFilter, determinePartitionsMeters, determinePartitionsParseExceptionHandler)) {
while (inputRowIterator.hasNext()) {
final InputRow inputRow = inputRowIterator.next();
final Interval interval;
if (determineIntervals) {
interval = granularitySpec.getSegmentGranularity().bucket(inputRow.getTimestamp());
} else {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
// this interval must exist since it passed the rowFilter
assert optInterval.isPresent();
interval = optInterval.get();
}
if (partitionsSpec.needsDeterminePartitions(false)) {
hllCollectors.computeIfAbsent(interval, intv -> Optional.of(HyperLogLogCollector.makeLatestCollector()));
List<Object> groupKey = Rows.toGroupKey(queryGranularity.bucketStart(inputRow.getTimestampFromEpoch()), inputRow);
hllCollectors.get(interval).get().add(HASH_FUNCTION.hashBytes(jsonMapper.writeValueAsBytes(groupKey)).asBytes());
} else {
// we don't need to determine partitions but we still need to determine intervals, so add an Optional.absent()
// for the interval and don't instantiate a HLL collector
hllCollectors.putIfAbsent(interval, Optional.absent());
}
determinePartitionsMeters.incrementProcessed();
}
}
// These metrics are reported in generateAndPublishSegments()
if (determinePartitionsMeters.getThrownAway() > 0) {
log.warn("Unable to find a matching interval for [%,d] events", determinePartitionsMeters.getThrownAway());
}
if (determinePartitionsMeters.getUnparseable() > 0) {
log.warn("Unable to parse [%,d] events", determinePartitionsMeters.getUnparseable());
}
return hllCollectors;
}
use of org.apache.druid.data.input.InputSource in project druid by druid-io.
the class IndexTask method runTask.
@Override
public TaskStatus runTask(final TaskToolbox toolbox) {
try {
log.debug("Found chat handler of class[%s]", toolbox.getChatHandlerProvider().getClass().getName());
if (toolbox.getChatHandlerProvider().get(getId()).isPresent()) {
// This is a workaround for ParallelIndexSupervisorTask to avoid double registering when it runs in the
// sequential mode. See ParallelIndexSupervisorTask.runSequential().
// Note that all HTTP endpoints are not available in this case. This works only for
// ParallelIndexSupervisorTask because it doesn't support APIs for live ingestion reports.
log.warn("Chat handler is already registered. Skipping chat handler registration.");
} else {
toolbox.getChatHandlerProvider().register(getId(), this, false);
}
this.authorizerMapper = toolbox.getAuthorizerMapper();
this.determinePartitionsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.buildSegmentsMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
this.determinePartitionsParseExceptionHandler = new ParseExceptionHandler(determinePartitionsMeters, ingestionSchema.getTuningConfig().isLogParseExceptions(), ingestionSchema.getTuningConfig().getMaxParseExceptions(), ingestionSchema.getTuningConfig().getMaxSavedParseExceptions());
this.buildSegmentsParseExceptionHandler = new ParseExceptionHandler(buildSegmentsMeters, ingestionSchema.getTuningConfig().isLogParseExceptions(), ingestionSchema.getTuningConfig().getMaxParseExceptions(), ingestionSchema.getTuningConfig().getMaxSavedParseExceptions());
final boolean determineIntervals = ingestionSchema.getDataSchema().getGranularitySpec().inputIntervals().isEmpty();
final InputSource inputSource = ingestionSchema.getIOConfig().getNonNullInputSource(ingestionSchema.getDataSchema().getParser());
final File tmpDir = toolbox.getIndexingTmpDir();
ingestionState = IngestionState.DETERMINE_PARTITIONS;
// Initialize maxRowsPerSegment and maxTotalRows lazily
final IndexTuningConfig tuningConfig = ingestionSchema.tuningConfig;
final PartitionsSpec partitionsSpec = tuningConfig.getGivenOrDefaultPartitionsSpec();
final PartitionAnalysis partitionAnalysis = determineShardSpecs(toolbox, inputSource, tmpDir, partitionsSpec);
final List<Interval> allocateIntervals = new ArrayList<>(partitionAnalysis.getAllIntervalsToIndex());
final DataSchema dataSchema;
if (determineIntervals) {
final boolean gotLocks = determineLockGranularityAndTryLock(toolbox.getTaskActionClient(), allocateIntervals, ingestionSchema.getIOConfig());
if (!gotLocks) {
throw new ISE("Failed to get locks for intervals[%s]", allocateIntervals);
}
dataSchema = ingestionSchema.getDataSchema().withGranularitySpec(ingestionSchema.getDataSchema().getGranularitySpec().withIntervals(JodaUtils.condenseIntervals(allocateIntervals)));
} else {
dataSchema = ingestionSchema.getDataSchema();
}
ingestionState = IngestionState.BUILD_SEGMENTS;
return generateAndPublishSegments(toolbox, dataSchema, inputSource, tmpDir, partitionAnalysis);
} catch (Exception e) {
log.error(e, "Encountered exception in %s.", ingestionState);
errorMsg = Throwables.getStackTraceAsString(e);
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
return TaskStatus.failure(getId(), errorMsg);
} finally {
toolbox.getChatHandlerProvider().unregister(getId());
}
}
Aggregations