use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class SinglePhaseParallelIndexingTest method testIngestBothExplicitAndImplicitDims.
/**
 * Writes five JSON input files (two rows each) carrying both {@code explicitDim} and {@code implicitDim},
 * ingests them with a dimensions spec that lists {@code ts} and {@code explicitDim} explicitly and has
 * {@code includeAllDimensions} enabled, and checks that every published segment reports the dimensions
 * {@code [ts, explicitDim, implicitDim]} in that order.
 */
@Test
public void testIngestBothExplicitAndImplicitDims() throws IOException {
  final Interval interval = Intervals.of("2017-12/P1M");

  // Prepare the input files: file N holds one row for day 24+N and one row for day 25+N.
  for (int fileIdx = 0; fileIdx < 5; fileIdx++) {
    final File inputFile = new File(inputDir, "test_" + fileIdx + ".json");
    try (final Writer writer = Files.newBufferedWriter(inputFile.toPath(), StandardCharsets.UTF_8)) {
      for (int baseDay = 24; baseDay <= 25; baseDay++) {
        writer.write(
            getObjectMapper().writeValueAsString(
                ImmutableMap.of(
                    "ts", StringUtils.format("2017-12-%d", baseDay + fileIdx),
                    "implicitDim", "implicit_" + fileIdx,
                    "explicitDim", "explicit_" + fileIdx
                )
            )
        );
      }
    }
  }

  // Only "ts" and "explicitDim" are declared; "implicitDim" has to be picked up via includeAllDimensions.
  final DimensionsSpec dimensionsSpec = DimensionsSpec
      .builder()
      .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim"))
      .setIncludeAllDimensions(true)
      .build();
  final DataSchema dataSchema = new DataSchema(
      "dataSource",
      DEFAULT_TIMESTAMP_SPEC,
      dimensionsSpec,
      new AggregatorFactory[] { new CountAggregatorFactory("cnt") },
      new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, Collections.singletonList(interval)),
      null
  );
  final ParallelIndexIOConfig ioConfig = new ParallelIndexIOConfig(
      null,
      new SettableSplittableLocalInputSource(inputDir, "*.json", true),
      new JsonInputFormat(new JSONPathSpec(true, null), null, null),
      false,
      null
  );
  final ParallelIndexIngestionSpec ingestionSpec = new ParallelIndexIngestionSpec(
      dataSchema,
      ioConfig,
      AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING
  );
  final ParallelIndexSupervisorTask task = new ParallelIndexSupervisorTask(null, null, null, ingestionSpec, null);
  task.addToContext(Tasks.FORCE_TIME_CHUNK_LOCK_KEY, lockGranularity == LockGranularity.TIME_CHUNK);

  // Run the task to completion and require success before inspecting its output.
  Assert.assertEquals(TaskState.SUCCESS, getIndexingServiceClient().runAndWait(task).getStatusCode());

  final ImmutableList<String> expectedDimensions = ImmutableList.of("ts", "explicitDim", "implicitDim");
  final Set<DataSegment> publishedSegments = getIndexingServiceClient().getPublishedSegments(task);
  for (DataSegment published : publishedSegments) {
    Assert.assertEquals(expectedDimensions, published.getDimensions());
  }
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class SinglePhaseSubTask method generateAndPushSegments.
/**
 * Reads input data row by row and adds each row to a segment using a {@link BaseAppenderatorDriver}.
 * If there is no segment for the row, a new one is created. Segments can be pushed in the middle of reading
 * inputs when the driver reports that a push is required, which is evaluated against:
 *
 * <ul>
 * <li>
 * {@link DynamicPartitionsSpec#maxRowsPerSegment} (rows in a single segment), and
 * </li>
 * <li>
 * {@link DynamicPartitionsSpec#maxTotalRows} (rows added to the {@link BaseAppenderatorDriver} so far), falling
 * back to {@link DynamicPartitionsSpec#DEFAULT_MAX_TOTAL_ROWS} when not set.
 * </li>
 * </ul>
 * <p>
 * After the last row has been read, all remaining segments are pushed as well.
 *
 * @param toolbox     task toolbox; used to register the metrics monitor and to build the segment allocator,
 *                    the appenderator and the driver
 * @param taskClient  client handed to {@link SupervisorTaskAccess} for the segment allocator
 * @param inputSource source the input rows are read from
 * @param tmpDir      directory passed to the input source reader
 *
 * @return the set of all segments pushed by this method, covering both the intermediate pushes and the final one
 *
 * @throws IOException          declared by the reader/driver calls used here
 * @throws InterruptedException declared by the driver calls used here
 * @throws RuntimeException     wrapping a {@link TimeoutException} or {@link ExecutionException} raised while
 *                              adding or pushing
 * @throws ISE                  if the driver reports that a row could not be added
 */
private Set<DataSegment> generateAndPushSegments(final TaskToolbox toolbox, final ParallelIndexSupervisorTaskClient taskClient, final InputSource inputSource, final File tmpDir) throws IOException, InterruptedException {
final DataSchema dataSchema = ingestionSchema.getDataSchema();
final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
// The FireDepartment is built only to obtain a metrics object that the monitor below can report on.
final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
toolbox.addMonitor(new RealtimeMetricsMonitor(Collections.singletonList(fireDepartmentForMetrics), Collections.singletonMap(DruidMetrics.TASK_ID, new String[] { getId() })));
final ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
// NOTE(review): unchecked cast; this method assumes the partitions spec is dynamic. Confirm callers guarantee it.
final DynamicPartitionsSpec partitionsSpec = (DynamicPartitionsSpec) tuningConfig.getGivenOrDefaultPartitionsSpec();
final long pushTimeout = tuningConfig.getPushTimeout();
// True when the granularity spec lists input intervals; rows are then filtered against those intervals below.
final boolean explicitIntervals = !granularitySpec.inputIntervals().isEmpty();
final boolean useLineageBasedSegmentAllocation = getContextValue(SinglePhaseParallelIndexTaskRunner.CTX_USE_LINEAGE_BASED_SEGMENT_ALLOCATION_KEY, SinglePhaseParallelIndexTaskRunner.LEGACY_DEFAULT_USE_LINEAGE_BASED_SEGMENT_ALLOCATION);
// With lineage-based allocation, subtaskSpecId is used as the sequenceName so that retry tasks for the same spec
// can allocate the same set of segments. Otherwise the task id is used.
final String sequenceName = useLineageBasedSegmentAllocation ? Preconditions.checkNotNull(subtaskSpecId, "subtaskSpecId") : getId();
final SegmentAllocatorForBatch segmentAllocator = SegmentAllocators.forLinearPartitioning(toolbox, sequenceName, new SupervisorTaskAccess(getSupervisorTaskId(), taskClient), getIngestionSchema().getDataSchema(), getTaskLockHelper(), ingestionSchema.getIOConfig().isAppendToExisting(), partitionsSpec, useLineageBasedSegmentAllocation);
final boolean useMaxMemoryEstimates = getContextValue(Tasks.USE_MAX_MEMORY_ESTIMATES, Tasks.DEFAULT_USE_MAX_MEMORY_ESTIMATES);
final Appenderator appenderator = BatchAppenderators.newAppenderator(getId(), toolbox.getAppenderatorsManager(), fireDepartmentMetrics, toolbox, dataSchema, tuningConfig, rowIngestionMeters, parseExceptionHandler, useMaxMemoryEstimates);
// Tracks whether the try block failed, so the finally block can pick closeNow() over close().
boolean exceptionOccurred = false;
// The driver and the row iterator are closed automatically by try-with-resources; the appenderator is closed
// explicitly (see the end of the try block and the finally block).
try (final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator);
final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(tmpDir, dataSchema, inputSource, inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null, inputRow -> {
// Row filter handed to the reader: reject null rows, and reject rows outside the explicit intervals (if any).
if (inputRow == null) {
return false;
}
if (explicitIntervals) {
final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
return optInterval.isPresent();
}
return true;
}, rowIngestionMeters, parseExceptionHandler)) {
driver.startJob();
// Accumulates every segment pushed by this method; this is the return value.
final Set<DataSegment> pushedSegments = new HashSet<>();
while (inputRowIterator.hasNext()) {
final InputRow inputRow = inputRowIterator.next();
// Segments are created as needed, using a single sequence name. They may be allocated from the overlord
// (in append mode) or may be created on our own authority (in overwrite mode).
final AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName);
if (addResult.isOk()) {
final boolean isPushRequired = addResult.isPushRequired(partitionsSpec.getMaxRowsPerSegment(), partitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS));
if (isPushRequired) {
// There can be some segments waiting to be published even though no more rows will be added to them.
// If those segments are not pushed here, the available space in the appenderator stays small,
// which makes the resulting segments smaller.
final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
pushedSegments.addAll(pushed.getSegments());
LOG.info("Pushed [%s] segments", pushed.getSegments().size());
LOG.infoSegments(pushed.getSegments(), "Pushed segments");
}
} else {
throw new ISE("Failed to add a row with timestamp[%s]", inputRow.getTimestamp());
}
// Reached only for rows that were added successfully; the failure branch above throws.
fireDepartmentMetrics.incrementProcessed();
}
// Final push for whatever is still held by the driver after the input is exhausted.
final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
pushedSegments.addAll(pushed.getSegments());
LOG.info("Pushed [%s] segments", pushed.getSegments().size());
LOG.infoSegments(pushed.getSegments(), "Pushed segments");
// NOTE(review): the finally block calls appenderator.close() again on the success path, so close() runs twice.
// Presumably close() is idempotent and this call exists to close the appenderator before the driver is
// auto-closed -- confirm before removing either call.
appenderator.close();
return pushedSegments;
} catch (TimeoutException | ExecutionException e) {
// These checked exceptions are not in the method signature, so they are rethrown unchecked with the cause kept.
exceptionOccurred = true;
throw new RuntimeException(e);
} catch (Exception e) {
exceptionOccurred = true;
throw e;
} finally {
// On failure use closeNow(); on success use the regular close().
if (exceptionOccurred) {
appenderator.closeNow();
} else {
appenderator.close();
}
}
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class IndexIngestionSpecTest method testParserAndInputFormat.
/**
 * Builds an {@link IndexIngestionSpec} from a data schema that carries a parser map and an IO config that has both
 * an input source and an input format, and expects an {@link IllegalArgumentException} telling the user to use
 * inputFormat instead of parser.
 */
@Test
public void testParserAndInputFormat() {
  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("Cannot use parser and inputSource together. Try using inputFormat instead of parser.");

  final DataSchema schemaWithParser = new DataSchema(
      "dataSource",
      ImmutableMap.of("fake", "parser map"),
      new AggregatorFactory[0],
      new ArbitraryGranularitySpec(Granularities.NONE, null),
      null,
      null
  );
  final IndexIOConfig ioConfigWithFormat = new IndexIOConfig(
      null,
      new NoopInputSource(),
      new NoopInputFormat(),
      null,
      null
  );
  // The construction is expected to throw; the result is never used.
  final IndexIngestionSpec spec = new IndexIngestionSpec(schemaWithParser, ioConfigWithFormat, null);
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class IndexIngestionSpecTest method testParserAndInputSource.
/**
 * Builds an {@link IndexIngestionSpec} from a data schema that carries a parser map and an IO config that has an
 * input source but no input format, and expects an {@link IllegalArgumentException} stating that parser and
 * inputSource cannot be used together.
 */
@Test
public void testParserAndInputSource() {
  expectedException.expect(IllegalArgumentException.class);
  expectedException.expectMessage("Cannot use parser and inputSource together.");

  final DataSchema schemaWithParser = new DataSchema(
      "dataSource",
      ImmutableMap.of("fake", "parser map"),
      new AggregatorFactory[0],
      new ArbitraryGranularitySpec(Granularities.NONE, null),
      null,
      null
  );
  final IndexIOConfig ioConfigWithoutFormat = new IndexIOConfig(
      null,
      new NoopInputSource(),
      null,
      null,
      null
  );
  // The construction is expected to throw; the result is never used.
  final IndexIngestionSpec spec = new IndexIngestionSpec(schemaWithParser, ioConfigWithoutFormat, null);
}
use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.
the class CompactionTaskTest method assertIngestionSchema.
/**
 * Asserts that each generated ingestion spec matches the expectations, position by position.
 * <p>
 * For the i-th spec this checks the data schema (data source, timestamp spec, dimensions compared as a set,
 * aggregators, granularity spec built from the i-th expected interval), the IO config (not appending, expected
 * dropExisting flag, a {@link DruidInputSource} over the expected data source and interval with no filter) and the
 * tuning config.
 *
 * @param ingestionSchemas           specs under test
 * @param expectedDimensionsSpecs    expected dimensions spec per ingestion spec; must have the same size as
 *                                   {@code ingestionSchemas}
 * @param expectedMetricsSpec        expected aggregators, shared by all specs
 * @param expectedSegmentIntervals   expected interval per ingestion spec, looked up by the same index
 * @param expectedTuningConfig       expected tuning config, shared by all specs
 * @param expectedSegmentGranularity expected segment granularity
 * @param expectedQueryGranularity   expected query granularity
 * @param expectedDropExisting       expected value of the IO config's dropExisting flag
 *
 * @throws IllegalArgumentException if {@code ingestionSchemas} and {@code expectedDimensionsSpecs} differ in size
 */
private void assertIngestionSchema(List<ParallelIndexIngestionSpec> ingestionSchemas, List<DimensionsSpec> expectedDimensionsSpecs, List<AggregatorFactory> expectedMetricsSpec, List<Interval> expectedSegmentIntervals, CompactionTask.CompactionTuningConfig expectedTuningConfig, Granularity expectedSegmentGranularity, Granularity expectedQueryGranularity, boolean expectedDropExisting) {
  Preconditions.checkArgument(
      ingestionSchemas.size() == expectedDimensionsSpecs.size(),
      "ingestionSchemas.size()[%s] should be the same as expectedDimensionsSpecs.size()[%s]",
      ingestionSchemas.size(),
      expectedDimensionsSpecs.size()
  );

  for (int i = 0; i < ingestionSchemas.size(); i++) {
    final ParallelIndexIngestionSpec ingestionSchema = ingestionSchemas.get(i);
    final DimensionsSpec expectedDimensionsSpec = expectedDimensionsSpecs.get(i);

    // assert dataSchema
    final DataSchema dataSchema = ingestionSchema.getDataSchema();
    Assert.assertEquals(DATA_SOURCE, dataSchema.getDataSource());
    Assert.assertEquals(
        new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null),
        dataSchema.getTimestampSpec()
    );
    // Dimensions are compared as sets, so their order is deliberately not checked here.
    Assert.assertEquals(
        new HashSet<>(expectedDimensionsSpec.getDimensions()),
        new HashSet<>(dataSchema.getDimensionsSpec().getDimensions())
    );

    // metrics
    Assert.assertEquals(expectedMetricsSpec, Arrays.asList(dataSchema.getAggregators()));
    Assert.assertEquals(
        new UniformGranularitySpec(
            expectedSegmentGranularity,
            expectedQueryGranularity,
            false,
            Collections.singletonList(expectedSegmentIntervals.get(i))
        ),
        dataSchema.getGranularitySpec()
    );

    // assert ioConfig
    final ParallelIndexIOConfig ioConfig = ingestionSchema.getIOConfig();
    Assert.assertFalse(ioConfig.isAppendToExisting());
    Assert.assertEquals(expectedDropExisting, ioConfig.isDropExisting());

    final InputSource inputSource = ioConfig.getInputSource();
    Assert.assertTrue(inputSource instanceof DruidInputSource);
    final DruidInputSource druidInputSource = (DruidInputSource) inputSource;
    Assert.assertEquals(DATA_SOURCE, druidInputSource.getDataSource());
    Assert.assertEquals(expectedSegmentIntervals.get(i), druidInputSource.getInterval());
    Assert.assertNull(druidInputSource.getDimFilter());

    // assert tuningConfig
    Assert.assertEquals(expectedTuningConfig, ingestionSchema.getTuningConfig());
  }
}
Aggregations