Use of org.apache.druid.data.input.Committer in project druid by druid-io.
The class BatchAppenderator, method add.
@Override
public AppenderatorAddResult add(final SegmentIdWithShardSpec identifier, final InputRow row, @Nullable final Supplier<Committer> committerSupplier, final boolean allowIncrementalPersists) throws IndexSizeExceededException, SegmentNotWritableException {
throwPersistErrorIfExists();
Preconditions.checkArgument(committerSupplier == null, "Batch appenderator does not need a committer!");
Preconditions.checkArgument(allowIncrementalPersists, "Batch appenderator should always allow incremental persists!");
if (!identifier.getDataSource().equals(schema.getDataSource())) {
throw new IAE("Expected dataSource[%s] but was asked to insert row for dataSource[%s]?!", schema.getDataSource(), identifier.getDataSource());
}
final Sink sink = getOrCreateSink(identifier);
metrics.reportMessageMaxTimestamp(row.getTimestampFromEpoch());
final int sinkRowsInMemoryBeforeAdd = sink.getNumRowsInMemory();
final int sinkRowsInMemoryAfterAdd;
final long bytesInMemoryBeforeAdd = sink.getBytesInMemory();
final long bytesInMemoryAfterAdd;
final IncrementalIndexAddResult addResult;
try {
// allowIncrementalPersists is always true for batch, hence the hard-coded false here
addResult = sink.add(row, false);
sinkRowsInMemoryAfterAdd = addResult.getRowCount();
bytesInMemoryAfterAdd = addResult.getBytesInMemory();
} catch (IndexSizeExceededException e) {
// Uh oh, we can't do anything about this! We can't persist (commit metadata would be out of sync) and we
// can't add the row (it just failed). This should never actually happen, though, because we check
// sink.canAppendRow after returning from add.
log.error(e, "Sink for segment[%s] was unexpectedly full!", identifier);
throw e;
}
if (sinkRowsInMemoryAfterAdd < 0) {
throw new SegmentNotWritableException("Attempt to add row to swapped-out sink for segment[%s].", identifier);
}
if (addResult.isRowAdded()) {
rowIngestionMeters.incrementProcessed();
} else if (addResult.hasParseException()) {
parseExceptionHandler.handle(addResult.getParseException());
}
final int numAddedRows = sinkRowsInMemoryAfterAdd - sinkRowsInMemoryBeforeAdd;
rowsCurrentlyInMemory += numAddedRows;
bytesCurrentlyInMemory += (bytesInMemoryAfterAdd - bytesInMemoryBeforeAdd);
totalRows += numAddedRows;
sinksMetadata.computeIfAbsent(identifier, unused -> new SinkMetadata()).addRows(numAddedRows);
boolean persist = false;
List<String> persistReasons = new ArrayList<>();
if (!sink.canAppendRow()) {
persist = true;
persistReasons.add("No more rows can be appended to sink");
}
if (rowsCurrentlyInMemory >= tuningConfig.getMaxRowsInMemory()) {
persist = true;
persistReasons.add(StringUtils.format("rowsCurrentlyInMemory[%d] is greater than maxRowsInMemory[%d]", rowsCurrentlyInMemory, tuningConfig.getMaxRowsInMemory()));
}
if (bytesCurrentlyInMemory >= maxBytesTuningConfig) {
persist = true;
persistReasons.add(StringUtils.format("bytesCurrentlyInMemory[%d] is greater than maxBytesInMemory[%d]", bytesCurrentlyInMemory, maxBytesTuningConfig));
}
if (persist) {
// persistAll clears rowsCurrentlyInMemory, no need to update it.
log.info("Incremental persist to disk because %s.", String.join(",", persistReasons));
long bytesToBePersisted = 0L;
for (Map.Entry<SegmentIdWithShardSpec, Sink> entry : sinks.entrySet()) {
final Sink sinkEntry = entry.getValue();
if (sinkEntry != null) {
bytesToBePersisted += sinkEntry.getBytesInMemory();
if (sinkEntry.swappable()) {
// Code for batch no longer memory maps hydrants, but they still take memory...
int memoryStillInUse = calculateMemoryUsedByHydrant();
bytesCurrentlyInMemory += memoryStillInUse;
}
}
}
if (!skipBytesInMemoryOverheadCheck && bytesCurrentlyInMemory - bytesToBePersisted > maxBytesTuningConfig) {
// We are still over maxBytesTuningConfig even after persisting.
// This means that we ran out of all available memory to ingest (due to overheads created as part of ingestion)
final String alertMessage = StringUtils.format("Task has exceeded safe estimated heap usage limits, failing " + "(numSinks: [%d] numHydrantsAcrossAllSinks: [%d] totalRows: [%d])" + "(bytesCurrentlyInMemory: [%d] - bytesToBePersisted: [%d] > maxBytesTuningConfig: [%d])", sinks.size(), sinks.values().stream().mapToInt(Iterables::size).sum(), getTotalRowCount(), bytesCurrentlyInMemory, bytesToBePersisted, maxBytesTuningConfig);
final String errorMessage = StringUtils.format("%s.\nThis can occur when the overhead from too many intermediary segment persists becomes too " + "great to have enough space to process additional input rows. This check, along with metering the overhead " + "of these objects to factor into the 'maxBytesInMemory' computation, can be disabled by setting " + "'skipBytesInMemoryOverheadCheck' to 'true' (note that doing so might allow the task to naturally encounter " + "a 'java.lang.OutOfMemoryError'). Alternatively, 'maxBytesInMemory' can be increased which will cause an " + "increase in heap footprint, but will allow for more intermediary segment persists to occur before " + "reaching this condition.", alertMessage);
log.makeAlert(alertMessage).addData("dataSource", schema.getDataSource()).emit();
throw new RuntimeException(errorMessage);
}
Futures.addCallback(persistAll(null), new FutureCallback<Object>() {
@Override
public void onSuccess(@Nullable Object result) {
// do nothing
}
@Override
public void onFailure(Throwable t) {
persistError = t;
}
});
}
return new AppenderatorAddResult(identifier, sinksMetadata.get(identifier).numRowsInSegment, false);
}
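For orientation, here is a minimal caller sketch (not part of the Druid source) that satisfies the two preconditions checked at the top of add(): the committer supplier must be null and incremental persists must always be allowed. The appenderator, identifier, and row variables are assumed to already exist, and the checked exceptions (IndexSizeExceededException, SegmentNotWritableException) still need to be handled by the caller.
// Hypothetical usage sketch; assumes an initialized BatchAppenderator plus a SegmentIdWithShardSpec and an InputRow already in scope.
AppenderatorAddResult result = appenderator.add(
    identifier,
    row,
    null, // batch ingestion carries no commit metadata, so no committer supplier
    true // incremental persists must always be allowed for batch
);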
Use of org.apache.druid.data.input.Committer in project druid by druid-io.
The class BatchAppenderator, method persistAll.
@Override
public ListenableFuture<Object> persistAll(@Nullable final Committer committer) {
throwPersistErrorIfExists();
if (committer != null) {
throw new ISE("committer must be null for BatchAppenderator");
}
// Get ready to persist all sinks:
final Map<SegmentIdWithShardSpec, Sink> sinksToPersist = swapSinks();
final Stopwatch runExecStopwatch = Stopwatch.createStarted();
ListenableFuture<Object> future = persistExecutor.submit(() -> {
log.info("Spawning intermediate persist");
// figure out hydrants (indices) to persist:
final List<Pair<FireHydrant, SegmentIdWithShardSpec>> indexesToPersist = new ArrayList<>();
int numPersistedRows = 0;
long bytesPersisted = 0;
int totalHydrantsCount = 0;
final long totalSinks = sinksToPersist.size();
for (Map.Entry<SegmentIdWithShardSpec, Sink> entry : sinksToPersist.entrySet()) {
final SegmentIdWithShardSpec identifier = entry.getKey();
final Sink sink = entry.getValue();
if (sink == null) {
throw new ISE("No sink for identifier: %s", identifier);
}
final List<FireHydrant> hydrants = Lists.newArrayList(sink);
// Since every time we persist we also get rid of the in-memory references to sink & hydrants
// the invariant of exactly one, always swappable, sink with exactly one unpersisted hydrant must hold
int totalHydrantsForSink = hydrants.size();
if (totalHydrantsForSink != 1) {
throw new ISE("There should be only one hydrant for identifier[%s] but there are[%s]", identifier, totalHydrantsForSink);
}
totalHydrantsCount++;
numPersistedRows += sink.getNumRowsInMemory();
bytesPersisted += sink.getBytesInMemory();
if (!sink.swappable()) {
throw new ISE("Sink is not swappable![%s]", identifier);
}
indexesToPersist.add(Pair.of(sink.swap(), identifier));
}
if (indexesToPersist.isEmpty()) {
log.info("No indexes will be persisted");
}
final Stopwatch persistStopwatch = Stopwatch.createStarted();
try {
for (Pair<FireHydrant, SegmentIdWithShardSpec> pair : indexesToPersist) {
metrics.incrementRowOutputCount(persistHydrant(pair.lhs, pair.rhs));
}
log.info("Persisted in-memory data for segments: %s", indexesToPersist.stream().filter(itp -> itp.rhs != null).map(itp -> itp.rhs.asSegmentId().toString()).distinct().collect(Collectors.joining(", ")));
log.info("Persisted stats: processed rows: [%d], persisted rows[%d], persisted sinks: [%d], persisted fireHydrants (across sinks): [%d]", rowIngestionMeters.getProcessed(), numPersistedRows, totalSinks, totalHydrantsCount);
// note that we do not need to reset sinks metadata since we did it at the start...
} catch (Exception e) {
metrics.incrementFailedPersists();
throw e;
} finally {
metrics.incrementNumPersists();
long persistMillis = persistStopwatch.elapsed(TimeUnit.MILLISECONDS);
metrics.incrementPersistTimeMillis(persistMillis);
persistStopwatch.stop();
// make sure no push can start while persisting:
log.info("Persisted rows[%,d] and bytes[%,d] and removed all sinks & hydrants from memory in[%d] millis", numPersistedRows, bytesPersisted, persistMillis);
log.info("Persist is done.");
}
return null;
});
final long startDelay = runExecStopwatch.elapsed(TimeUnit.MILLISECONDS);
metrics.incrementPersistBackPressureMillis(startDelay);
if (startDelay > PERSIST_WARN_DELAY) {
log.warn("Ingestion was throttled for [%,d] millis because persists were pending.", startDelay);
}
runExecStopwatch.stop();
return future;
}
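A short sketch (again not from the Druid source) of driving this method directly: for BatchAppenderator the committer argument must be null, and the returned future resolves to null once the intermediate persist has finished.
// Hypothetical usage sketch; appenderator is assumed to be an initialized BatchAppenderator.
ListenableFuture<Object> persistFuture = appenderator.persistAll(null);
persistFuture.get(); // blocks until the persist completes; any persist failure surfaces as an ExecutionException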
Use of org.apache.druid.data.input.Committer in project druid by druid-io.
The class StreamAppenderator, method push.
@Override
public ListenableFuture<SegmentsAndCommitMetadata> push(final Collection<SegmentIdWithShardSpec> identifiers, @Nullable final Committer committer, final boolean useUniquePath) {
final Map<SegmentIdWithShardSpec, Sink> theSinks = new HashMap<>();
AtomicLong pushedHydrantsCount = new AtomicLong();
for (final SegmentIdWithShardSpec identifier : identifiers) {
final Sink sink = sinks.get(identifier);
if (sink == null) {
throw new ISE("No sink for identifier: %s", identifier);
}
theSinks.put(identifier, sink);
if (sink.finishWriting()) {
totalRows.addAndGet(-sink.getNumRows());
}
// count hydrants for stats:
pushedHydrantsCount.addAndGet(Iterables.size(sink));
}
// Always persist all segments regardless of the input so that metadata is committed for all segments.
return Futures.transform(persistAll(committer), (Function<Object, SegmentsAndCommitMetadata>) commitMetadata -> {
final List<DataSegment> dataSegments = new ArrayList<>();
log.info("Preparing to push (stats): processed rows: [%d], sinks: [%d], fireHydrants (across sinks): [%d]", rowIngestionMeters.getProcessed(), theSinks.size(), pushedHydrantsCount.get());
log.debug("Building and pushing segments: %s", theSinks.keySet().stream().map(SegmentIdWithShardSpec::toString).collect(Collectors.joining(", ")));
for (Map.Entry<SegmentIdWithShardSpec, Sink> entry : theSinks.entrySet()) {
if (droppingSinks.contains(entry.getKey())) {
log.warn("Skipping push of currently-dropping sink[%s]", entry.getKey());
continue;
}
final DataSegment dataSegment = mergeAndPush(entry.getKey(), entry.getValue(), useUniquePath);
if (dataSegment != null) {
dataSegments.add(dataSegment);
} else {
log.warn("mergeAndPush[%s] returned null, skipping.", entry.getKey());
}
}
log.info("Push complete...");
return new SegmentsAndCommitMetadata(dataSegments, commitMetadata);
}, pushExecutor);
}
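For contrast with the batch path, a hypothetical streaming caller (not taken from the Druid source) could pass a committer whose metadata records the consumer offsets to be committed together with the pushed segments, mirroring the anonymous Committer used in the tests below.
// Hypothetical sketch; appenderator and identifiers are assumed to be in scope.
final Map<String, Long> offsetsToCommit = ImmutableMap.of("topic-partition-0", 42L);
final Committer committer = new Committer() {
    @Override
    public Object getMetadata() {
        return offsetsToCommit; // stored as commit metadata alongside the segments
    }
    @Override
    public void run() {
        // invoked once the commit metadata has been durably persisted
    }
};
ListenableFuture<SegmentsAndCommitMetadata> pushFuture = appenderator.push(identifiers, committer, false);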
Use of org.apache.druid.data.input.Committer in project druid by druid-io.
The class RealtimePlumberSchoolTest, method testPersistHydrantGapsHelper.
private void testPersistHydrantGapsHelper(final Object commitMetadata) throws Exception {
Interval testInterval = new Interval(DateTimes.of("1970-01-01"), DateTimes.of("1971-01-01"));
RealtimePlumber plumber2 = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema2, tuningConfig, metrics);
Sink sink = new Sink(testInterval, schema2, tuningConfig.getShardSpec(), DateTimes.of("2014-12-01T12:34:56.789").toString(), tuningConfig.getAppendableIndexSpec(), tuningConfig.getMaxRowsInMemory(), tuningConfig.getMaxBytesInMemoryOrDefault(), true, tuningConfig.getDedupColumn());
plumber2.getSinks().put(0L, sink);
Assert.assertNull(plumber2.startJob());
final CountDownLatch doneSignal = new CountDownLatch(1);
final Committer committer = new Committer() {
@Override
public Object getMetadata() {
return commitMetadata;
}
@Override
public void run() {
doneSignal.countDown();
}
};
plumber2.add(getTestInputRow("1970-01-01"), Suppliers.ofInstance(committer));
plumber2.add(getTestInputRow("1970-02-01"), Suppliers.ofInstance(committer));
plumber2.add(getTestInputRow("1970-03-01"), Suppliers.ofInstance(committer));
plumber2.add(getTestInputRow("1970-04-01"), Suppliers.ofInstance(committer));
plumber2.add(getTestInputRow("1970-05-01"), Suppliers.ofInstance(committer));
plumber2.persist(committer);
doneSignal.await();
plumber2.getSinks().clear();
plumber2.finishJob();
File persistDir = plumber2.computePersistDir(schema2, testInterval);
/* Check that all hydrants were persisted */
for (int i = 0; i < 5; i++) {
Assert.assertTrue(new File(persistDir, String.valueOf(i)).exists());
}
/* Create some gaps in the persisted hydrants and reload */
FileUtils.deleteDirectory(new File(persistDir, "1"));
FileUtils.deleteDirectory(new File(persistDir, "3"));
RealtimePlumber restoredPlumber = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema2, tuningConfig, metrics);
restoredPlumber.bootstrapSinksFromDisk();
Map<Long, Sink> sinks = restoredPlumber.getSinks();
Assert.assertEquals(1, sinks.size());
List<FireHydrant> hydrants = Lists.newArrayList(sinks.get(0L));
DateTime startTime = DateTimes.of("1970-01-01T00:00:00.000Z");
Interval expectedInterval = new Interval(startTime, DateTimes.of("1971-01-01T00:00:00.000Z"));
Assert.assertEquals(0, hydrants.get(0).getCount());
Assert.assertEquals(expectedInterval, hydrants.get(0).getSegmentDataInterval());
Assert.assertEquals(2, hydrants.get(1).getCount());
Assert.assertEquals(expectedInterval, hydrants.get(1).getSegmentDataInterval());
Assert.assertEquals(4, hydrants.get(2).getCount());
Assert.assertEquals(expectedInterval, hydrants.get(2).getSegmentDataInterval());
/* Delete all the hydrants and reload, no sink should be created */
FileUtils.deleteDirectory(new File(persistDir, "0"));
FileUtils.deleteDirectory(new File(persistDir, "2"));
FileUtils.deleteDirectory(new File(persistDir, "4"));
RealtimePlumber restoredPlumber2 = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema2, tuningConfig, metrics);
restoredPlumber2.bootstrapSinksFromDisk();
Assert.assertEquals(0, restoredPlumber2.getSinks().size());
}
Use of org.apache.druid.data.input.Committer in project druid by druid-io.
The class RealtimePlumberSchoolTest, method testDimOrderInheritanceHelper.
private void testDimOrderInheritanceHelper(final Object commitMetadata) throws Exception {
List<List<String>> expectedDims = ImmutableList.of(ImmutableList.of("dimD"), ImmutableList.of("dimC"), ImmutableList.of("dimA"), ImmutableList.of("dimB"), ImmutableList.of("dimE"), ImmutableList.of("dimD", "dimC", "dimA", "dimB", "dimE"));
QueryableIndex qindex;
FireHydrant hydrant;
Map<Long, Sink> sinks;
RealtimePlumber plumber = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema2, tuningConfig, metrics);
Assert.assertNull(plumber.startJob());
final CountDownLatch doneSignal = new CountDownLatch(1);
final Committer committer = new Committer() {
@Override
public Object getMetadata() {
return commitMetadata;
}
@Override
public void run() {
doneSignal.countDown();
}
};
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimD"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimC"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimA"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimB"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimE"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.add(getTestInputRowFull("1970-01-01", ImmutableList.of("dimA", "dimB", "dimC", "dimD", "dimE"), ImmutableList.of("1")), Suppliers.ofInstance(committer));
plumber.persist(committer);
doneSignal.await();
plumber.getSinks().clear();
plumber.finishJob();
RealtimePlumber restoredPlumber = (RealtimePlumber) realtimePlumberSchool.findPlumber(schema2, tuningConfig, metrics);
restoredPlumber.bootstrapSinksFromDisk();
sinks = restoredPlumber.getSinks();
Assert.assertEquals(1, sinks.size());
List<FireHydrant> hydrants = Lists.newArrayList(sinks.get(0L));
for (int i = 0; i < hydrants.size(); i++) {
hydrant = hydrants.get(i);
ReferenceCountingSegment segment = hydrant.getIncrementedSegment();
try {
qindex = segment.asQueryableIndex();
Assert.assertEquals(i, hydrant.getCount());
Assert.assertEquals(expectedDims.get(i), ImmutableList.copyOf(qindex.getAvailableDimensions()));
} finally {
segment.decrement();
}
}
}