Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
From the class KafkaSupervisorTest, method getDataSchema.
private static DataSchema getDataSchema(String dataSource) {
  List<DimensionSchema> dimensions = new ArrayList<>();
  dimensions.add(StringDimensionSchema.create("dim1"));
  dimensions.add(StringDimensionSchema.create("dim2"));
  return new DataSchema(
      dataSource,
      new TimestampSpec("timestamp", "iso", null),
      new DimensionsSpec(dimensions),
      new AggregatorFactory[]{new CountAggregatorFactory("rows")},
      new UniformGranularitySpec(Granularities.HOUR, Granularities.NONE, ImmutableList.of()),
      null
  );
}
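As a minimal sketch only (not part of the original test), the schema built by this helper can be checked with the standard DataSchema accessors; the "metrics-kafka" datasource name below is a hypothetical value.
// Illustrative only: verify what the helper above produces.
DataSchema schema = getDataSchema("metrics-kafka");
Assert.assertEquals("metrics-kafka", schema.getDataSource());
Assert.assertEquals(1, schema.getAggregators().length);
Assert.assertEquals(Granularities.HOUR, schema.getGranularitySpec().getSegmentGranularity());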
Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
From the class KinesisSupervisorTest, method testIsTaskCurrent.
@Test
public void testIsTaskCurrent() {
  DateTime minMessageTime = DateTimes.nowUtc();
  DateTime maxMessageTime = DateTimes.nowUtc().plus(10000);
  KinesisSupervisor supervisor = getSupervisor(
      1, 1, true, "PT1H", new Period("P1D"), new Period("P1D"),
      false, 42, 42, dataSchema, tuningConfig
  );
  supervisor.addTaskGroupToActivelyReadingTaskGroup(
      42,
      ImmutableMap.of(SHARD_ID1, "3"),
      Optional.of(minMessageTime),
      Optional.of(maxMessageTime),
      ImmutableSet.of("id1", "id2", "id3", "id4"),
      ImmutableSet.of()
  );
  DataSchema modifiedDataSchema = getDataSchema("some other datasource");
  KinesisSupervisorTuningConfig modifiedTuningConfig = new KinesisSupervisorTuningConfig(
      null, 1000, null, null, 50000, null, new Period("P1Y"), new File("/test"),
      null, null, null, false, null, null, null, null,
      numThreads, TEST_CHAT_THREADS, TEST_CHAT_RETRIES, TEST_HTTP_TIMEOUT, TEST_SHUTDOWN_TIMEOUT,
      null, null, null, 5000, null, null, null, null,
      // This property is different from tuningConfig
      42,
      null, null, null, null
  );
  KinesisIndexTask taskFromStorage = createKinesisIndexTask(
      "id1",
      0,
      new SeekableStreamStartSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, "3"), ImmutableSet.of()),
      new SeekableStreamEndSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, KinesisSequenceNumber.NO_END_SEQUENCE_NUMBER)),
      minMessageTime,
      maxMessageTime,
      dataSchema
  );
  KinesisIndexTask taskFromStorageMismatchedDataSchema = createKinesisIndexTask(
      "id2",
      0,
      new SeekableStreamStartSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, "3"), ImmutableSet.of()),
      new SeekableStreamEndSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, KinesisSequenceNumber.NO_END_SEQUENCE_NUMBER)),
      minMessageTime,
      maxMessageTime,
      modifiedDataSchema
  );
  KinesisIndexTask taskFromStorageMismatchedTuningConfig = createKinesisIndexTask(
      "id3",
      0,
      new SeekableStreamStartSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, "3"), ImmutableSet.of()),
      new SeekableStreamEndSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, KinesisSequenceNumber.NO_END_SEQUENCE_NUMBER)),
      minMessageTime,
      maxMessageTime,
      dataSchema,
      modifiedTuningConfig
  );
  KinesisIndexTask taskFromStorageMismatchedPartitionsWithTaskGroup = createKinesisIndexTask(
      "id4",
      0,
      new SeekableStreamStartSequenceNumbers<>(
          "stream",
          // this is the mismatch
          ImmutableMap.of(SHARD_ID1, "4"),
          ImmutableSet.of()
      ),
      new SeekableStreamEndSequenceNumbers<>("stream", ImmutableMap.of(SHARD_ID1, KinesisSequenceNumber.NO_END_SEQUENCE_NUMBER)),
      minMessageTime,
      maxMessageTime,
      dataSchema
  );
  EasyMock.expect(taskStorage.getTask("id1")).andReturn(Optional.of(taskFromStorage)).once();
  EasyMock.expect(taskStorage.getTask("id2")).andReturn(Optional.of(taskFromStorageMismatchedDataSchema)).once();
  EasyMock.expect(taskStorage.getTask("id3")).andReturn(Optional.of(taskFromStorageMismatchedTuningConfig)).once();
  EasyMock.expect(taskStorage.getTask("id4")).andReturn(Optional.of(taskFromStorageMismatchedPartitionsWithTaskGroup)).once();
  replayAll();
  Assert.assertTrue(supervisor.isTaskCurrent(42, "id1"));
  Assert.assertFalse(supervisor.isTaskCurrent(42, "id2"));
  Assert.assertFalse(supervisor.isTaskCurrent(42, "id3"));
  Assert.assertFalse(supervisor.isTaskCurrent(42, "id4"));
  verifyAll();
}
Use of org.apache.druid.segment.indexing.DataSchema in project hive by apache.
From the class TestDruidRecordWriter, method testWrite.
// Test is failing due to Guava dependency, Druid 0.13.0 should have less dependency on Guava
@Ignore
@Test
public void testWrite() throws IOException, SegmentLoadingException {
  final String dataSourceName = "testDataSource";
  final File segmentOutputDir = temporaryFolder.newFolder();
  final File workingDir = temporaryFolder.newFolder();
  Configuration config = new Configuration();
  final InputRowParser inputRowParser = new MapInputRowParser(
      new TimeAndDimsParseSpec(
          new TimestampSpec(DruidConstants.DEFAULT_TIMESTAMP_COLUMN, "auto", null),
          new DimensionsSpec(ImmutableList.of(new StringDimensionSchema("host")), null, null)
      )
  );
  final Map<String, Object> parserMap = objectMapper.convertValue(
      inputRowParser,
      new TypeReference<Map<String, Object>>() {
      }
  );
  DataSchema dataSchema = new DataSchema(
      dataSourceName,
      parserMap,
      new AggregatorFactory[]{
          new LongSumAggregatorFactory("visited_sum", "visited_sum"),
          new HyperUniquesAggregatorFactory("unique_hosts", "unique_hosts")
      },
      new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of(INTERVAL_FULL)),
      null,
      objectMapper
  );
  IndexSpec indexSpec = new IndexSpec(new RoaringBitmapSerdeFactory(true), null, null, null);
  RealtimeTuningConfig tuningConfig = new RealtimeTuningConfig(
      null, null, null, null, temporaryFolder.newFolder(), null, null, null, null,
      indexSpec, null, null, 0, 0, null, null, 0L, null, null
  );
  LocalFileSystem localFileSystem = FileSystem.getLocal(config);
  DataSegmentPusher dataSegmentPusher = new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig() {
    @Override
    public File getStorageDirectory() {
      return segmentOutputDir;
    }
  });
  Path segmentDescriptorPath = new Path(workingDir.getAbsolutePath(), DruidStorageHandler.SEGMENTS_DESCRIPTOR_DIR_NAME);
  DruidRecordWriter druidRecordWriter = new DruidRecordWriter(
      dataSchema, tuningConfig, dataSegmentPusher, 20, segmentDescriptorPath, localFileSystem
  );
  List<DruidWritable> druidWritables = expectedRows.stream()
      .map(input -> new DruidWritable(
          ImmutableMap.<String, Object>builder()
              .putAll(input)
              .put(
                  Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME,
                  Granularities.DAY
                      .bucketStart(new DateTime((long) input.get(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)))
                      .getMillis()
              )
              .build()
      ))
      .collect(Collectors.toList());
  for (DruidWritable druidWritable : druidWritables) {
    druidRecordWriter.write(druidWritable);
  }
  druidRecordWriter.close(false);
  List<DataSegment> dataSegmentList = DruidStorageHandlerUtils.getCreatedSegments(segmentDescriptorPath, config);
  Assert.assertEquals(1, dataSegmentList.size());
  File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
  new LocalDataSegmentPuller().getSegmentFiles(dataSegmentList.get(0), tmpUnzippedSegmentDir);
  final QueryableIndex queryableIndex = DruidStorageHandlerUtils.INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
  QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(queryableIndex);
  Firehose firehose = new IngestSegmentFirehose(
      ImmutableList.of(new WindowedStorageAdapter(adapter, adapter.getInterval())),
      null,
      ImmutableList.of("host"),
      ImmutableList.of("visited_sum", "unique_hosts"),
      null
  );
  List<InputRow> rows = Lists.newArrayList();
  while (firehose.hasMore()) {
    rows.add(firehose.nextRow());
  }
  verifyRows(expectedRows, rows);
}
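The only column the test adds to each record is the granularity column: the event timestamp truncated to the segment granularity. A small illustrative sketch with hypothetical timestamp values (not part of the original test):
// Hypothetical example: an event at 03:15 UTC lands in the day bucket starting at 00:00 UTC.
long eventMillis = new DateTime("2014-10-22T03:15:00Z").getMillis();
long granularityColumnValue = Granularities.DAY.bucketStart(new DateTime(eventMillis)).getMillis();
// granularityColumnValue is the epoch millis of 2014-10-22T00:00:00Z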
Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
From the class MaterializedViewSupervisorTest, method testCheckSegmentsAndSubmitTasks.
@Test
public void testCheckSegmentsAndSubmitTasks() throws IOException {
  Set<DataSegment> baseSegments = Sets.newHashSet(
      new DataSegment(
          "base",
          Intervals.of("2015-01-02T00Z/2015-01-03T00Z"),
          "2015-01-03",
          ImmutableMap.of(),
          ImmutableList.of("dim1", "dim2"),
          ImmutableList.of("m1"),
          new HashBasedNumberedShardSpec(0, 1, 0, 1, null, null, null),
          9,
          1024
      )
  );
  indexerMetadataStorageCoordinator.announceHistoricalSegments(baseSegments);
  EasyMock.expect(taskMaster.getTaskQueue()).andReturn(Optional.of(taskQueue)).anyTimes();
  EasyMock.expect(taskMaster.getTaskRunner()).andReturn(Optional.absent()).anyTimes();
  EasyMock.expect(taskStorage.getActiveTasks()).andReturn(ImmutableList.of()).anyTimes();
  EasyMock.expect(taskStorage.getStatus("test_task1"))
      .andReturn(Optional.of(TaskStatus.failure("test_task1", "Dummy task status failure err message")))
      .anyTimes();
  EasyMock.expect(taskStorage.getStatus("test_task2"))
      .andReturn(Optional.of(TaskStatus.running("test_task2")))
      .anyTimes();
  EasyMock.replay(taskStorage);
  Pair<Map<Interval, HadoopIndexTask>, Map<Interval, String>> runningTasksPair = supervisor.getRunningTasks();
  Map<Interval, HadoopIndexTask> runningTasks = runningTasksPair.lhs;
  Map<Interval, String> runningVersion = runningTasksPair.rhs;
  DataSchema dataSchema = new DataSchema("test_datasource", null, null, null, TransformSpec.NONE, objectMapper);
  HadoopIOConfig hadoopIOConfig = new HadoopIOConfig(new HashMap<>(), null, null);
  HadoopIngestionSpec spec = new HadoopIngestionSpec(dataSchema, hadoopIOConfig, null);
  HadoopIndexTask task1 = new HadoopIndexTask("test_task1", spec, null, null, null, objectMapper, null, null, null);
  runningTasks.put(Intervals.of("2015-01-01T00Z/2015-01-02T00Z"), task1);
  runningVersion.put(Intervals.of("2015-01-01T00Z/2015-01-02T00Z"), "test_version1");
  HadoopIndexTask task2 = new HadoopIndexTask("test_task2", spec, null, null, null, objectMapper, null, null, null);
  runningTasks.put(Intervals.of("2015-01-02T00Z/2015-01-03T00Z"), task2);
  runningVersion.put(Intervals.of("2015-01-02T00Z/2015-01-03T00Z"), "test_version2");
  supervisor.checkSegmentsAndSubmitTasks();
  Map<Interval, HadoopIndexTask> expectedRunningTasks = new HashMap<>();
  Map<Interval, String> expectedRunningVersion = new HashMap<>();
  expectedRunningTasks.put(Intervals.of("2015-01-02T00Z/2015-01-03T00Z"), task2);
  expectedRunningVersion.put(Intervals.of("2015-01-02T00Z/2015-01-03T00Z"), "test_version2");
  Assert.assertEquals(expectedRunningTasks, runningTasks);
  Assert.assertEquals(expectedRunningVersion, runningVersion);
}
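This test uses the parser-map DataSchema constructor with a null parser and null aggregators. For contrast only, an illustrative sketch (not from the test) of a similarly bare schema built with the TimestampSpec/DimensionsSpec constructor used in the Kafka example at the top of this page; all field values here are placeholders.
// Illustrative only: the same bare-bones schema via the non-parser constructor.
DataSchema equivalentSchema = new DataSchema(
    "test_datasource",
    new TimestampSpec(null, null, null),
    new DimensionsSpec(ImmutableList.of()),
    new AggregatorFactory[0],
    new UniformGranularitySpec(Granularities.DAY, Granularities.NONE, ImmutableList.of()),
    TransformSpec.NONE
);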
Use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
From the class IndexTask, method generateAndPublishSegments.
/**
 * This method reads the input data row by row and adds each row to the proper segment using {@link BaseAppenderatorDriver}.
 * If there is no segment for the row, a new one is created. Segments can be published in the middle of reading inputs
 * if {@link DynamicPartitionsSpec} is used and one of the conditions below is satisfied.
 *
 * <ul>
 * <li>
 * The number of rows in a segment exceeds {@link DynamicPartitionsSpec#maxRowsPerSegment}
 * </li>
 * <li>
 * The number of rows added to {@link BaseAppenderatorDriver} so far exceeds {@link DynamicPartitionsSpec#maxTotalRows}
 * </li>
 * </ul>
 * <p>
 * At the end of this method, all remaining segments are published.
 *
 * @return the last {@link TaskStatus}
 */
private TaskStatus generateAndPublishSegments(
    final TaskToolbox toolbox,
    final DataSchema dataSchema,
    final InputSource inputSource,
    final File tmpDir,
    final PartitionAnalysis partitionAnalysis
) throws IOException, InterruptedException {
  final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
  FireDepartmentMetrics buildSegmentsFireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
  if (toolbox.getMonitorScheduler() != null) {
    final TaskRealtimeMetricsMonitor metricsMonitor =
        TaskRealtimeMetricsMonitorBuilder.build(this, fireDepartmentForMetrics, buildSegmentsMeters);
    toolbox.getMonitorScheduler().addMonitor(metricsMonitor);
  }
  final PartitionsSpec partitionsSpec = partitionAnalysis.getPartitionsSpec();
  final IndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
  final long pushTimeout = tuningConfig.getPushTimeout();
  final SegmentAllocatorForBatch segmentAllocator;
  final SequenceNameFunction sequenceNameFunction;
  switch (partitionsSpec.getType()) {
    case HASH:
    case RANGE:
      final SegmentAllocatorForBatch localSegmentAllocator = SegmentAllocators.forNonLinearPartitioning(
          toolbox,
          getDataSource(),
          baseSequenceName,
          dataSchema.getGranularitySpec(),
          null,
          (CompletePartitionAnalysis) partitionAnalysis
      );
      sequenceNameFunction = localSegmentAllocator.getSequenceNameFunction();
      segmentAllocator = localSegmentAllocator;
      break;
    case LINEAR:
      segmentAllocator = SegmentAllocators.forLinearPartitioning(
          toolbox,
          baseSequenceName,
          null,
          dataSchema,
          getTaskLockHelper(),
          ingestionSchema.getIOConfig().isAppendToExisting(),
          partitionAnalysis.getPartitionsSpec(),
          null
      );
      sequenceNameFunction = segmentAllocator.getSequenceNameFunction();
      break;
    default:
      throw new UOE("[%s] secondary partition type is not supported", partitionsSpec.getType());
  }
  Set<DataSegment> segmentsFoundForDrop = null;
  if (ingestionSchema.getIOConfig().isDropExisting()) {
    segmentsFoundForDrop = getUsedSegmentsWithinInterval(
        toolbox,
        getDataSource(),
        ingestionSchema.getDataSchema().getGranularitySpec().inputIntervals()
    );
  }
  final TransactionalSegmentPublisher publisher =
      (segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish, commitMetadata) ->
          toolbox.getTaskActionClient().submit(
              SegmentTransactionalInsertAction.overwriteAction(segmentsToBeOverwritten, segmentsToDrop, segmentsToPublish)
          );
  String effectiveId = getContextValue(CompactionTask.CTX_KEY_APPENDERATOR_TRACKING_TASK_ID, null);
  if (effectiveId == null) {
    effectiveId = getId();
  }
  final Appenderator appenderator = BatchAppenderators.newAppenderator(
      effectiveId,
      toolbox.getAppenderatorsManager(),
      buildSegmentsFireDepartmentMetrics,
      toolbox,
      dataSchema,
      tuningConfig,
      buildSegmentsMeters,
      buildSegmentsParseExceptionHandler,
      isUseMaxMemoryEstimates()
  );
  boolean exceptionOccurred = false;
  try (final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator)) {
    driver.startJob();
    InputSourceProcessor.process(
        dataSchema,
        driver,
        partitionsSpec,
        inputSource,
        inputSource.needsFormat() ? getInputFormat(ingestionSchema) : null,
        tmpDir,
        sequenceNameFunction,
        new DefaultIndexTaskInputRowIteratorBuilder(),
        buildSegmentsMeters,
        buildSegmentsParseExceptionHandler,
        pushTimeout
    );
    // If we use timeChunk lock, then we don't have to specify what segments will be overwritten because
    // it will just overwrite all segments overlapped with the new segments.
    final Set<DataSegment> inputSegments = getTaskLockHelper().isUseSegmentLock()
        ? getTaskLockHelper().getLockedExistingSegments()
        : null;
    final boolean storeCompactionState = getContextValue(Tasks.STORE_COMPACTION_STATE_KEY, Tasks.DEFAULT_STORE_COMPACTION_STATE);
    final Function<Set<DataSegment>, Set<DataSegment>> annotateFunction =
        compactionStateAnnotateFunction(storeCompactionState, toolbox, ingestionSchema);
    // Probably we can publish atomicUpdateGroup along with segments.
    final SegmentsAndCommitMetadata published =
        awaitPublish(driver.publishAll(inputSegments, segmentsFoundForDrop, publisher, annotateFunction), pushTimeout);
    appenderator.close();
    // for awaitSegmentAvailabilityTimeoutMillis
    if (tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis() > 0 && published != null) {
      ingestionState = IngestionState.SEGMENT_AVAILABILITY_WAIT;
      ArrayList<DataSegment> segmentsToWaitFor = new ArrayList<>(published.getSegments());
      waitForSegmentAvailability(toolbox, segmentsToWaitFor, tuningConfig.getAwaitSegmentAvailabilityTimeoutMillis());
    }
    ingestionState = IngestionState.COMPLETED;
    if (published == null) {
      log.error("Failed to publish segments, aborting!");
      errorMsg = "Failed to publish segments.";
      toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
      return TaskStatus.failure(getId(), errorMsg);
    } else {
      log.info(
          "Processed[%,d] events, unparseable[%,d], thrownAway[%,d].",
          buildSegmentsMeters.getProcessed(),
          buildSegmentsMeters.getUnparseable(),
          buildSegmentsMeters.getThrownAway()
      );
      log.info("Published [%s] segments", published.getSegments().size());
      log.debugSegments(published.getSegments(), "Published segments");
      toolbox.getTaskReportFileWriter().write(getId(), getTaskCompletionReports());
      return TaskStatus.success(getId());
    }
  } catch (TimeoutException | ExecutionException e) {
    exceptionOccurred = true;
    throw new RuntimeException(e);
  } catch (Exception e) {
    exceptionOccurred = true;
    throw e;
  } finally {
    if (exceptionOccurred) {
      appenderator.closeNow();
    } else {
      appenderator.close();
    }
  }
}
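The javadoc above names the two DynamicPartitionsSpec thresholds that can trigger an incremental publish. As a rough sketch only, not the IndexTask implementation, the check amounts to something like the following; shouldPublishIncrementally is a hypothetical helper name.
// Hypothetical helper sketching the two thresholds described in the javadoc above.
static boolean shouldPublishIncrementally(
    DynamicPartitionsSpec partitionsSpec,
    long rowsInCurrentSegment,
    long totalRowsAddedToDriver
) {
  final Integer maxRowsPerSegment = partitionsSpec.getMaxRowsPerSegment();
  final Long maxTotalRows = partitionsSpec.getMaxTotalRows();
  // Publish early if the open segment is full, or if the driver has buffered too many rows overall.
  return (maxRowsPerSegment != null && rowsInCurrentSegment >= maxRowsPerSegment)
      || (maxTotalRows != null && totalRowsAddedToDriver >= maxTotalRows);
}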