use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class CompactionTaskRunTest method testRunRegularIndexTaskWithIngestSegmentFirehose.
/**
* Run a regular index task that's equivalent to the compaction task in {@link #testRunWithDynamicPartitioning()},
* using {@link IngestSegmentFirehoseFactory}.
*
* This is not entirely CompactionTask related, but it's similar conceptually and it requires
* similar setup to what this test suite already has.
*
* It could be moved to a separate test class if needed.
*/
@Test
public void testRunRegularIndexTaskWithIngestSegmentFirehose() throws Exception {
runIndexTask();
IndexTask indexTask = new IndexTask(null, null, new IndexTask.IndexIngestionSpec(new DataSchema("test", getObjectMapper().convertValue(new StringInputRowParser(DEFAULT_PARSE_SPEC, null), Map.class), new AggregatorFactory[] { new LongSumAggregatorFactory("val", "val") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.MINUTE, null), null, getObjectMapper()), new IndexTask.IndexIOConfig(new IngestSegmentFirehoseFactory(DATA_SOURCE, Intervals.of("2014-01-01/2014-01-02"), null, null, null, null, null, getIndexIO(), coordinatorClient, segmentCacheManagerFactory, RETRY_POLICY_FACTORY), false, false), IndexTaskTest.createTuningConfig(5000000, null, null, Long.MAX_VALUE, null, false, true)), null);
// This is a regular index so we need to explicitly add this context to store the CompactionState
indexTask.addToContext(Tasks.STORE_COMPACTION_STATE_KEY, true);
final Pair<TaskStatus, List<DataSegment>> resultPair = runTask(indexTask);
Assert.assertTrue(resultPair.lhs.isSuccess());
final List<DataSegment> segments = resultPair.rhs;
Assert.assertEquals(3, segments.size());
for (int i = 0; i < 3; i++) {
Assert.assertEquals(Intervals.of("2014-01-01T0%d:00:00/2014-01-01T0%d:00:00", i, i + 1), segments.get(i).getInterval());
Assert.assertEquals(getDefaultCompactionState(Granularities.HOUR, Granularities.MINUTE, ImmutableList.of()), segments.get(i).getLastCompactionState());
if (lockGranularity == LockGranularity.SEGMENT) {
Assert.assertEquals(new NumberedOverwriteShardSpec(32768, 0, 2, (short) 1, (short) 1), segments.get(i).getShardSpec());
} else {
Assert.assertEquals(new NumberedShardSpec(0, 1), segments.get(i).getShardSpec());
}
}
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class KinesisSupervisorTest method getDataSchema.
private static DataSchema getDataSchema(String dataSource) {
List<DimensionSchema> dimensions = new ArrayList<>();
dimensions.add(StringDimensionSchema.create("dim1"));
dimensions.add(StringDimensionSchema.create("dim2"));
return new DataSchema(dataSource, new TimestampSpec("timestamp", "iso", null), new DimensionsSpec(dimensions), new AggregatorFactory[] { new CountAggregatorFactory("rows") }, new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()), null);
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class KafkaSupervisorTest method testIsTaskCurrent.
@Test
public void testIsTaskCurrent() {
DateTime minMessageTime = DateTimes.nowUtc();
DateTime maxMessageTime = DateTimes.nowUtc().plus(10000);
KafkaSupervisor supervisor = getSupervisor(2, 1, true, "PT1H", new Period("P1D"), new Period("P1D"), false, kafkaHost, dataSchema, new KafkaSupervisorTuningConfig(null, 1000, null, null, 50000, null, new Period("P1Y"), new File("/test"), null, null, null, false, null, false, null, numThreads, TEST_CHAT_THREADS, TEST_CHAT_RETRIES, TEST_HTTP_TIMEOUT, TEST_SHUTDOWN_TIMEOUT, null, null, null, null, null));
supervisor.addTaskGroupToActivelyReadingTaskGroup(42, ImmutableMap.of(0, 0L, 2, 0L), Optional.of(minMessageTime), Optional.of(maxMessageTime), ImmutableSet.of("id1", "id2", "id3", "id4"), ImmutableSet.of());
DataSchema modifiedDataSchema = getDataSchema("some other datasource");
KafkaSupervisorTuningConfig modifiedTuningConfig = new KafkaSupervisorTuningConfig(null, // This is different
42, null, null, 50000, null, new Period("P1Y"), new File("/test"), null, null, null, false, null, null, null, numThreads, TEST_CHAT_THREADS, TEST_CHAT_RETRIES, TEST_HTTP_TIMEOUT, TEST_SHUTDOWN_TIMEOUT, null, null, null, null, null);
KafkaIndexTask taskFromStorage = createKafkaIndexTask("id1", 0, new SeekableStreamStartSequenceNumbers<>("topic", ImmutableMap.of(0, 0L, 2, 0L), ImmutableSet.of()), new SeekableStreamEndSequenceNumbers<>("topic", ImmutableMap.of(0, Long.MAX_VALUE, 2, Long.MAX_VALUE)), minMessageTime, maxMessageTime, dataSchema, supervisor.getTuningConfig());
KafkaIndexTask taskFromStorageMismatchedDataSchema = createKafkaIndexTask("id2", 0, new SeekableStreamStartSequenceNumbers<>("topic", ImmutableMap.of(0, 0L, 2, 0L), ImmutableSet.of()), new SeekableStreamEndSequenceNumbers<>("topic", ImmutableMap.of(0, Long.MAX_VALUE, 2, Long.MAX_VALUE)), minMessageTime, maxMessageTime, modifiedDataSchema, supervisor.getTuningConfig());
KafkaIndexTask taskFromStorageMismatchedTuningConfig = createKafkaIndexTask("id3", 0, new SeekableStreamStartSequenceNumbers<>("topic", ImmutableMap.of(0, 0L, 2, 0L), ImmutableSet.of()), new SeekableStreamEndSequenceNumbers<>("topic", ImmutableMap.of(0, Long.MAX_VALUE, 2, Long.MAX_VALUE)), minMessageTime, maxMessageTime, dataSchema, modifiedTuningConfig);
KafkaIndexTask taskFromStorageMismatchedPartitionsWithTaskGroup = createKafkaIndexTask("id4", 0, new SeekableStreamStartSequenceNumbers<>("topic", ImmutableMap.of(0, 0L, 2, 6L), ImmutableSet.of()), new SeekableStreamEndSequenceNumbers<>("topic", ImmutableMap.of(0, Long.MAX_VALUE, 2, Long.MAX_VALUE)), minMessageTime, maxMessageTime, dataSchema, supervisor.getTuningConfig());
EasyMock.expect(taskStorage.getTask("id1")).andReturn(Optional.of(taskFromStorage)).once();
EasyMock.expect(taskStorage.getTask("id2")).andReturn(Optional.of(taskFromStorageMismatchedDataSchema)).once();
EasyMock.expect(taskStorage.getTask("id3")).andReturn(Optional.of(taskFromStorageMismatchedTuningConfig)).once();
EasyMock.expect(taskStorage.getTask("id4")).andReturn(Optional.of(taskFromStorageMismatchedPartitionsWithTaskGroup)).once();
replayAll();
Assert.assertTrue(supervisor.isTaskCurrent(42, "id1"));
Assert.assertFalse(supervisor.isTaskCurrent(42, "id2"));
Assert.assertFalse(supervisor.isTaskCurrent(42, "id3"));
Assert.assertFalse(supervisor.isTaskCurrent(42, "id4"));
verifyAll();
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class AppenderatorDriverRealtimeIndexTask method run.
@Override
public TaskStatus run(final TaskToolbox toolbox) {
runThread = Thread.currentThread();
authorizerMapper = toolbox.getAuthorizerMapper();
rowIngestionMeters = toolbox.getRowIngestionMetersFactory().createRowIngestionMeters();
parseExceptionHandler = new ParseExceptionHandler(rowIngestionMeters, spec.getTuningConfig().isLogParseExceptions(), spec.getTuningConfig().getMaxParseExceptions(), spec.getTuningConfig().getMaxSavedParseExceptions());
setupTimeoutAlert();
DataSchema dataSchema = spec.getDataSchema();
RealtimeAppenderatorTuningConfig tuningConfig = spec.getTuningConfig().withBasePersistDirectory(toolbox.getPersistDir());
final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
final TaskRealtimeMetricsMonitor metricsMonitor = TaskRealtimeMetricsMonitorBuilder.build(this, fireDepartmentForMetrics, rowIngestionMeters);
this.metrics = fireDepartmentForMetrics.getMetrics();
final Supplier<Committer> committerSupplier = Committers.nilSupplier();
DiscoveryDruidNode discoveryDruidNode = createDiscoveryDruidNode(toolbox);
appenderator = newAppenderator(dataSchema, tuningConfig, metrics, toolbox);
TaskLockType lockType = getContextValue(Tasks.USE_SHARED_LOCK, false) ? TaskLockType.SHARED : TaskLockType.EXCLUSIVE;
StreamAppenderatorDriver driver = newDriver(dataSchema, appenderator, toolbox, metrics, lockType);
try {
log.debug("Found chat handler of class[%s]", toolbox.getChatHandlerProvider().getClass().getName());
toolbox.getChatHandlerProvider().register(getId(), this, false);
if (toolbox.getAppenderatorsManager().shouldTaskMakeNodeAnnouncements()) {
toolbox.getDataSegmentServerAnnouncer().announce();
toolbox.getDruidNodeAnnouncer().announce(discoveryDruidNode);
}
driver.startJob(segmentId -> {
try {
if (lockGranularity == LockGranularity.SEGMENT) {
return toolbox.getTaskActionClient().submit(new SegmentLockAcquireAction(TaskLockType.EXCLUSIVE, segmentId.getInterval(), segmentId.getVersion(), segmentId.getShardSpec().getPartitionNum(), 1000L)).isOk();
} else {
final TaskLock lock = toolbox.getTaskActionClient().submit(new TimeChunkLockAcquireAction(TaskLockType.EXCLUSIVE, segmentId.getInterval(), 1000L));
if (lock == null) {
return false;
}
if (lock.isRevoked()) {
throw new ISE(StringUtils.format("Lock for interval [%s] was revoked.", segmentId.getInterval()));
}
return true;
}
} catch (IOException e) {
throw new RuntimeException(e);
}
});
// Set up metrics emission
toolbox.addMonitor(metricsMonitor);
// Delay firehose connection to avoid claiming input resources while the plumber is starting up.
final FirehoseFactory firehoseFactory = spec.getIOConfig().getFirehoseFactory();
final boolean firehoseDrainableByClosing = isFirehoseDrainableByClosing(firehoseFactory);
int sequenceNumber = 0;
String sequenceName = makeSequenceName(getId(), sequenceNumber);
final TransactionalSegmentPublisher publisher = (mustBeNullOrEmptyOverwriteSegments, mustBeNullOrEmptyDropSegments, segments, commitMetadata) -> {
if (mustBeNullOrEmptyOverwriteSegments != null && !mustBeNullOrEmptyOverwriteSegments.isEmpty()) {
throw new ISE("Stream ingestion task unexpectedly attempted to overwrite segments: %s", SegmentUtils.commaSeparatedIdentifiers(mustBeNullOrEmptyOverwriteSegments));
}
if (mustBeNullOrEmptyDropSegments != null && !mustBeNullOrEmptyDropSegments.isEmpty()) {
throw new ISE("Stream ingestion task unexpectedly attempted to drop segments: %s", SegmentUtils.commaSeparatedIdentifiers(mustBeNullOrEmptyDropSegments));
}
final SegmentTransactionalInsertAction action = SegmentTransactionalInsertAction.appendAction(segments, null, null);
return toolbox.getTaskActionClient().submit(action);
};
// Skip connecting firehose if we've been stopped before we got started.
synchronized (this) {
if (!gracefullyStopped) {
firehose = firehoseFactory.connect(Preconditions.checkNotNull(spec.getDataSchema().getParser(), "inputRowParser"), toolbox.getIndexingTmpDir());
}
}
ingestionState = IngestionState.BUILD_SEGMENTS;
// Time to read data!
while (!gracefullyStopped && firehoseDrainableByClosing && firehose.hasMore()) {
try {
InputRow inputRow = firehose.nextRow();
if (inputRow == null) {
log.debug("Discarded null row, considering thrownAway.");
rowIngestionMeters.incrementThrownAway();
} else {
AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName, committerSupplier);
if (addResult.isOk()) {
final boolean isPushRequired = addResult.isPushRequired(tuningConfig.getPartitionsSpec().getMaxRowsPerSegment(), tuningConfig.getPartitionsSpec().getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS));
if (isPushRequired) {
publishSegments(driver, publisher, committerSupplier, sequenceName);
sequenceNumber++;
sequenceName = makeSequenceName(getId(), sequenceNumber);
}
} else {
// If we allow continuing, then consider blacklisting the interval for a while to avoid constant checks.
throw new ISE("Could not allocate segment for row with timestamp[%s]", inputRow.getTimestamp());
}
}
} catch (ParseException e) {
handleParseException(e);
}
}
ingestionState = IngestionState.COMPLETED;
if (!gracefullyStopped) {
synchronized (this) {
if (gracefullyStopped) {
// Someone called stopGracefully after we checked the flag. That's okay, just stop now.
log.info("Gracefully stopping.");
} else {
finishingJob = true;
}
}
if (finishingJob) {
log.info("Finishing job...");
// Publish any remaining segments
publishSegments(driver, publisher, committerSupplier, sequenceName);
waitForSegmentPublishAndHandoff(tuningConfig.getPublishAndHandoffTimeout());
}
} else if (firehose != null) {
log.info("Task was gracefully stopped, will persist data before exiting");
persistAndWait(driver, committerSupplier.get());
}
} catch (Throwable e) {
log.makeAlert(e, "Exception aborted realtime processing[%s]", dataSchema.getDataSource()).emit();
errorMsg = Throwables.getStackTraceAsString(e);
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
return TaskStatus.failure(getId(), errorMsg);
} finally {
toolbox.getChatHandlerProvider().unregister(getId());
CloseableUtils.closeAndSuppressExceptions(firehose, e -> log.warn("Failed to close Firehose"));
appenderator.close();
CloseableUtils.closeAndSuppressExceptions(driver, e -> log.warn("Failed to close AppenderatorDriver"));
toolbox.removeMonitor(metricsMonitor);
if (toolbox.getAppenderatorsManager().shouldTaskMakeNodeAnnouncements()) {
toolbox.getDataSegmentServerAnnouncer().unannounce();
toolbox.getDruidNodeAnnouncer().unannounce(discoveryDruidNode);
}
}
log.info("Job done!");
toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
return TaskStatus.success(getId());
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class GranularityPathSpecTest method testAddInputPath.
@Test
public void testAddInputPath() throws Exception {
UserGroupInformation.setLoginUser(UserGroupInformation.createUserForTesting("test", new String[] { "testGroup" }));
HadoopIngestionSpec spec = new HadoopIngestionSpec(new DataSchema("foo", null, new AggregatorFactory[0], new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, ImmutableList.of(Intervals.of("2015-11-06T00:00Z/2015-11-07T00:00Z"))), null, jsonMapper), new HadoopIOConfig(null, null, null), DEFAULT_TUNING_CONFIG);
granularityPathSpec.setDataGranularity(Granularities.HOUR);
granularityPathSpec.setFilePattern(".*");
granularityPathSpec.setInputFormat(TextInputFormat.class);
Job job = Job.getInstance();
String formatStr = "file:%s/%s;org.apache.hadoop.mapreduce.lib.input.TextInputFormat";
testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=00");
testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=02");
testFolder.newFolder("test", "y=2015", "m=11", "d=06", "H=05");
testFolder.newFile("test/y=2015/m=11/d=06/H=00/file1");
testFolder.newFile("test/y=2015/m=11/d=06/H=02/file2");
testFolder.newFile("test/y=2015/m=11/d=06/H=05/file3");
testFolder.newFile("test/y=2015/m=11/d=06/H=05/file4");
granularityPathSpec.setInputPath(testFolder.getRoot().getPath() + "/test");
granularityPathSpec.addInputPaths(HadoopDruidIndexerConfig.fromSpec(spec), job);
String actual = job.getConfiguration().get("mapreduce.input.multipleinputs.dir.formats");
String expected = Joiner.on(",").join(Lists.newArrayList(StringUtils.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=00/file1"), StringUtils.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=02/file2"), StringUtils.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file3"), StringUtils.format(formatStr, testFolder.getRoot(), "test/y=2015/m=11/d=06/H=05/file4")));
Assert.assertEquals("Did not find expected input paths", expected, actual);
}
Aggregations