
Example 46 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class SinglePhaseParallelIndexingTest method testIngestBothExplicitAndImplicitDims.

@Test
public void testIngestBothExplicitAndImplicitDims() throws IOException {
    final Interval interval = Intervals.of("2017-12/P1M");
    for (int i = 0; i < 5; i++) {
        try (final Writer writer = Files.newBufferedWriter(new File(inputDir, "test_" + i + ".json").toPath(), StandardCharsets.UTF_8)) {
            writer.write(getObjectMapper().writeValueAsString(ImmutableMap.of("ts", StringUtils.format("2017-12-%d", 24 + i), "implicitDim", "implicit_" + i, "explicitDim", "explicit_" + i)));
            writer.write(getObjectMapper().writeValueAsString(ImmutableMap.of("ts", StringUtils.format("2017-12-%d", 25 + i), "implicitDim", "implicit_" + i, "explicitDim", "explicit_" + i)));
        }
    }
    final ParallelIndexSupervisorTask task = new ParallelIndexSupervisorTask(
        null,
        null,
        null,
        new ParallelIndexIngestionSpec(
            new DataSchema(
                "dataSource",
                DEFAULT_TIMESTAMP_SPEC,
                DimensionsSpec.builder()
                    .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim"))
                    .setIncludeAllDimensions(true)
                    .build(),
                new AggregatorFactory[] { new CountAggregatorFactory("cnt") },
                new UniformGranularitySpec(Granularities.DAY, Granularities.MINUTE, Collections.singletonList(interval)),
                null
            ),
            new ParallelIndexIOConfig(
                null,
                new SettableSplittableLocalInputSource(inputDir, "*.json", true),
                new JsonInputFormat(new JSONPathSpec(true, null), null, null),
                false,
                null
            ),
            AbstractParallelIndexSupervisorTaskTest.DEFAULT_TUNING_CONFIG_FOR_PARALLEL_INDEXING
        ),
        null
    );
    task.addToContext(Tasks.FORCE_TIME_CHUNK_LOCK_KEY, lockGranularity == LockGranularity.TIME_CHUNK);
    Assert.assertEquals(TaskState.SUCCESS, getIndexingServiceClient().runAndWait(task).getStatusCode());
    Set<DataSegment> segments = getIndexingServiceClient().getPublishedSegments(task);
    for (DataSegment segment : segments) {
        Assert.assertEquals(ImmutableList.of("ts", "explicitDim", "implicitDim"), segment.getDimensions());
    }
}
Also used: DataSegment (org.apache.druid.timeline.DataSegment), DataSchema (org.apache.druid.segment.indexing.DataSchema), UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec), JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat), CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory), JSONPathSpec (org.apache.druid.java.util.common.parsers.JSONPathSpec), File (java.io.File), Writer (java.io.Writer), Interval (org.joda.time.Interval), Test (org.junit.Test)
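The interesting part of this test is the DimensionsSpec: "ts" and "explicitDim" are declared up front, while setIncludeAllDimensions(true) lets the task pick up columns discovered in the input (here "implicitDim"), which is why the test expects the dimension order ["ts", "explicitDim", "implicitDim"]. Below is a minimal standalone sketch of just that builder call, assuming the same Druid version as the test; the class name and printout are illustrative only.

import com.google.common.collect.ImmutableList;
import org.apache.druid.data.input.impl.DimensionsSpec;

public class DimensionsSpecSketch {
    public static void main(String[] args) {
        // Explicit dimensions keep a fixed ordering; implicit columns found in the
        // data are appended after them when includeAllDimensions is set.
        DimensionsSpec dimensionsSpec = DimensionsSpec.builder()
            .setDefaultSchemaDimensions(ImmutableList.of("ts", "explicitDim"))
            .setIncludeAllDimensions(true)
            .build();
        System.out.println(dimensionsSpec.getDimensionNames());
    }
}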

Example 47 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class SinglePhaseSubTask method generateAndPushSegments.

/**
 * This method reads input data row by row and adds each row to a proper segment using {@link BaseAppenderatorDriver}.
 * If there is no segment for the row, a new one is created. Segments can be published in the middle of reading inputs
 * if one of the conditions below is satisfied.
 *
 * <ul>
 * <li>
 * If the number of rows in a segment exceeds {@link DynamicPartitionsSpec#maxRowsPerSegment}
 * </li>
 * <li>
 * If the number of rows added to {@link BaseAppenderatorDriver} so far exceeds {@link DynamicPartitionsSpec#maxTotalRows}
 * </li>
 * </ul>
 * <p>
 * At the end of this method, all the remaining segments are published.
 *
 * @return true if generated segments are successfully published, otherwise false
 */
private Set<DataSegment> generateAndPushSegments(final TaskToolbox toolbox, final ParallelIndexSupervisorTaskClient taskClient, final InputSource inputSource, final File tmpDir) throws IOException, InterruptedException {
    final DataSchema dataSchema = ingestionSchema.getDataSchema();
    final GranularitySpec granularitySpec = dataSchema.getGranularitySpec();
    final FireDepartment fireDepartmentForMetrics = new FireDepartment(dataSchema, new RealtimeIOConfig(null, null), null);
    final FireDepartmentMetrics fireDepartmentMetrics = fireDepartmentForMetrics.getMetrics();
    toolbox.addMonitor(new RealtimeMetricsMonitor(Collections.singletonList(fireDepartmentForMetrics), Collections.singletonMap(DruidMetrics.TASK_ID, new String[] { getId() })));
    final ParallelIndexTuningConfig tuningConfig = ingestionSchema.getTuningConfig();
    final DynamicPartitionsSpec partitionsSpec = (DynamicPartitionsSpec) tuningConfig.getGivenOrDefaultPartitionsSpec();
    final long pushTimeout = tuningConfig.getPushTimeout();
    final boolean explicitIntervals = !granularitySpec.inputIntervals().isEmpty();
    final boolean useLineageBasedSegmentAllocation = getContextValue(SinglePhaseParallelIndexTaskRunner.CTX_USE_LINEAGE_BASED_SEGMENT_ALLOCATION_KEY, SinglePhaseParallelIndexTaskRunner.LEGACY_DEFAULT_USE_LINEAGE_BASED_SEGMENT_ALLOCATION);
    // subtaskSpecId is used as the sequenceName, so that retry tasks for the same spec
    // can allocate the same set of segments.
    final String sequenceName = useLineageBasedSegmentAllocation ? Preconditions.checkNotNull(subtaskSpecId, "subtaskSpecId") : getId();
    final SegmentAllocatorForBatch segmentAllocator = SegmentAllocators.forLinearPartitioning(toolbox, sequenceName, new SupervisorTaskAccess(getSupervisorTaskId(), taskClient), getIngestionSchema().getDataSchema(), getTaskLockHelper(), ingestionSchema.getIOConfig().isAppendToExisting(), partitionsSpec, useLineageBasedSegmentAllocation);
    final boolean useMaxMemoryEstimates = getContextValue(Tasks.USE_MAX_MEMORY_ESTIMATES, Tasks.DEFAULT_USE_MAX_MEMORY_ESTIMATES);
    final Appenderator appenderator = BatchAppenderators.newAppenderator(getId(), toolbox.getAppenderatorsManager(), fireDepartmentMetrics, toolbox, dataSchema, tuningConfig, rowIngestionMeters, parseExceptionHandler, useMaxMemoryEstimates);
    boolean exceptionOccurred = false;
    try (final BatchAppenderatorDriver driver = BatchAppenderators.newDriver(appenderator, toolbox, segmentAllocator);
        final CloseableIterator<InputRow> inputRowIterator = AbstractBatchIndexTask.inputSourceReader(tmpDir, dataSchema, inputSource, inputSource.needsFormat() ? ParallelIndexSupervisorTask.getInputFormat(ingestionSchema) : null, inputRow -> {
            if (inputRow == null) {
                return false;
            }
            if (explicitIntervals) {
                final Optional<Interval> optInterval = granularitySpec.bucketInterval(inputRow.getTimestamp());
                return optInterval.isPresent();
            }
            return true;
        }, rowIngestionMeters, parseExceptionHandler)) {
        driver.startJob();
        final Set<DataSegment> pushedSegments = new HashSet<>();
        while (inputRowIterator.hasNext()) {
            final InputRow inputRow = inputRowIterator.next();
            // Segments are created as needed, using a single sequence name. They may be allocated from the overlord
            // (in append mode) or may be created on our own authority (in overwrite mode).
            final AppenderatorDriverAddResult addResult = driver.add(inputRow, sequenceName);
            if (addResult.isOk()) {
                final boolean isPushRequired = addResult.isPushRequired(partitionsSpec.getMaxRowsPerSegment(), partitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS));
                if (isPushRequired) {
                    // There can be segments waiting to be published even though no more rows will be added to them.
                    // If those segments are not published here, they keep occupying space in the appenderator,
                    // leaving less room for new data and thus producing smaller segments.
                    final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
                    pushedSegments.addAll(pushed.getSegments());
                    LOG.info("Pushed [%s] segments", pushed.getSegments().size());
                    LOG.infoSegments(pushed.getSegments(), "Pushed segments");
                }
            } else {
                throw new ISE("Failed to add a row with timestamp[%s]", inputRow.getTimestamp());
            }
            fireDepartmentMetrics.incrementProcessed();
        }
        final SegmentsAndCommitMetadata pushed = driver.pushAllAndClear(pushTimeout);
        pushedSegments.addAll(pushed.getSegments());
        LOG.info("Pushed [%s] segments", pushed.getSegments().size());
        LOG.infoSegments(pushed.getSegments(), "Pushed segments");
        appenderator.close();
        return pushedSegments;
    } catch (TimeoutException | ExecutionException e) {
        exceptionOccurred = true;
        throw new RuntimeException(e);
    } catch (Exception e) {
        exceptionOccurred = true;
        throw e;
    } finally {
        if (exceptionOccurred) {
            appenderator.closeNow();
        } else {
            appenderator.close();
        }
    }
}
Also used: RealtimeIOConfig (org.apache.druid.segment.indexing.RealtimeIOConfig), SegmentsAndCommitMetadata (org.apache.druid.segment.realtime.appenderator.SegmentsAndCommitMetadata), DataSegment (org.apache.druid.timeline.DataSegment), FireDepartment (org.apache.druid.segment.realtime.FireDepartment), ISE (org.apache.druid.java.util.common.ISE), ExecutionException (java.util.concurrent.ExecutionException), HashSet (java.util.HashSet), TimeoutException (java.util.concurrent.TimeoutException), BatchAppenderatorDriver (org.apache.druid.segment.realtime.appenderator.BatchAppenderatorDriver), AppenderatorDriverAddResult (org.apache.druid.segment.realtime.appenderator.AppenderatorDriverAddResult), IOException (java.io.IOException), DataSchema (org.apache.druid.segment.indexing.DataSchema), FireDepartmentMetrics (org.apache.druid.segment.realtime.FireDepartmentMetrics), DynamicPartitionsSpec (org.apache.druid.indexer.partitions.DynamicPartitionsSpec), SegmentAllocatorForBatch (org.apache.druid.indexing.common.task.SegmentAllocatorForBatch), Appenderator (org.apache.druid.segment.realtime.appenderator.Appenderator), ArbitraryGranularitySpec (org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec), GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec), InputRow (org.apache.druid.data.input.InputRow), RealtimeMetricsMonitor (org.apache.druid.segment.realtime.RealtimeMetricsMonitor), Interval (org.joda.time.Interval)
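The two publish triggers listed in the Javadoc above correspond one-to-one to the limits read at the isPushRequired call: DynamicPartitionsSpec#maxRowsPerSegment and DynamicPartitionsSpec#maxTotalRows. Here is a hedged standalone sketch of that spec; the limit value is illustrative, not a Druid default.

import org.apache.druid.indexer.partitions.DynamicPartitionsSpec;

public class DynamicPartitionsSpecSketch {
    public static void main(String[] args) {
        // maxRowsPerSegment caps a single segment; maxTotalRows caps everything
        // buffered in the driver before a push is forced. Passing null falls back
        // to defaults, as the getMaxTotalRowsOr(...) call in the task code shows.
        DynamicPartitionsSpec partitionsSpec = new DynamicPartitionsSpec(5_000_000, null);
        System.out.println(partitionsSpec.getMaxRowsPerSegment());
        System.out.println(partitionsSpec.getMaxTotalRowsOr(DynamicPartitionsSpec.DEFAULT_MAX_TOTAL_ROWS));
    }
}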

Example 48 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class IndexIngestionSpecTest method testParserAndInputFormat.

@Test
public void testParserAndInputFormat() {
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Cannot use parser and inputSource together. Try using inputFormat instead of parser.");
    final IndexIngestionSpec spec = new IndexIngestionSpec(
        new DataSchema(
            "dataSource",
            ImmutableMap.of("fake", "parser map"),
            new AggregatorFactory[0],
            new ArbitraryGranularitySpec(Granularities.NONE, null),
            null,
            null
        ),
        new IndexIOConfig(null, new NoopInputSource(), new NoopInputFormat(), null, null),
        null
    );
}
Also used: DataSchema (org.apache.druid.segment.indexing.DataSchema), IndexIngestionSpec (org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec), IndexIOConfig (org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig), NoopInputSource (org.apache.druid.data.input.impl.NoopInputSource), NoopInputFormat (org.apache.druid.data.input.impl.NoopInputFormat), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), ArbitraryGranularitySpec (org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec), Test (org.junit.Test)

Example 49 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class IndexIngestionSpecTest method testParserAndInputSource.

@Test
public void testParserAndInputSource() {
    expectedException.expect(IllegalArgumentException.class);
    expectedException.expectMessage("Cannot use parser and inputSource together.");
    final IndexIngestionSpec spec = new IndexIngestionSpec(
        new DataSchema(
            "dataSource",
            ImmutableMap.of("fake", "parser map"),
            new AggregatorFactory[0],
            new ArbitraryGranularitySpec(Granularities.NONE, null),
            null,
            null
        ),
        new IndexIOConfig(null, new NoopInputSource(), null, null, null),
        null
    );
}
Also used: DataSchema (org.apache.druid.segment.indexing.DataSchema), IndexIngestionSpec (org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec), IndexIOConfig (org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig), NoopInputSource (org.apache.druid.data.input.impl.NoopInputSource), AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory), ArbitraryGranularitySpec (org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec), Test (org.junit.Test)
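Examples 48 and 49 exercise the same validation from two angles: a DataSchema built with the legacy parser map rejects any IOConfig that also supplies an inputSource, with or without an inputFormat. For contrast, here is a sketch of the combination the error message recommends, reusing the constructors from the tests above; the default TimestampSpec and the empty dimensions builder are assumptions made for brevity.

    // Sketch only: supply inputFormat alongside inputSource and drop the parser map.
    final IndexIngestionSpec spec = new IndexIngestionSpec(
        new DataSchema(
            "dataSource",
            new TimestampSpec(null, null, null),    // nulls fall back to the default column/format
            DimensionsSpec.builder().build(),       // no explicit dimensions, for brevity
            new AggregatorFactory[0],
            new ArbitraryGranularitySpec(Granularities.NONE, null),
            null
        ),
        new IndexIOConfig(null, new NoopInputSource(), new NoopInputFormat(), null, null),
        null
    );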

Example 50 with DataSchema

use of org.apache.druid.segment.indexing.DataSchema in project druid by druid-io.

the class CompactionTaskTest method assertIngestionSchema.

private void assertIngestionSchema(List<ParallelIndexIngestionSpec> ingestionSchemas, List<DimensionsSpec> expectedDimensionsSpecs, List<AggregatorFactory> expectedMetricsSpec, List<Interval> expectedSegmentIntervals, CompactionTask.CompactionTuningConfig expectedTuningConfig, Granularity expectedSegmentGranularity, Granularity expectedQueryGranularity, boolean expectedDropExisting) {
    Preconditions.checkArgument(
        ingestionSchemas.size() == expectedDimensionsSpecs.size(),
        "ingestionSchemas.size()[%s] should be the same as expectedDimensionsSpecs.size()[%s]",
        ingestionSchemas.size(),
        expectedDimensionsSpecs.size()
    );
    for (int i = 0; i < ingestionSchemas.size(); i++) {
        final ParallelIndexIngestionSpec ingestionSchema = ingestionSchemas.get(i);
        final DimensionsSpec expectedDimensionsSpec = expectedDimensionsSpecs.get(i);
        // assert dataSchema
        final DataSchema dataSchema = ingestionSchema.getDataSchema();
        Assert.assertEquals(DATA_SOURCE, dataSchema.getDataSource());
        Assert.assertEquals(new TimestampSpec(ColumnHolder.TIME_COLUMN_NAME, "millis", null), dataSchema.getTimestampSpec());
        Assert.assertEquals(new HashSet<>(expectedDimensionsSpec.getDimensions()), new HashSet<>(dataSchema.getDimensionsSpec().getDimensions()));
        // metrics
        Assert.assertEquals(expectedMetricsSpec, Arrays.asList(dataSchema.getAggregators()));
        Assert.assertEquals(new UniformGranularitySpec(expectedSegmentGranularity, expectedQueryGranularity, false, Collections.singletonList(expectedSegmentIntervals.get(i))), dataSchema.getGranularitySpec());
        // assert ioConfig
        final ParallelIndexIOConfig ioConfig = ingestionSchema.getIOConfig();
        Assert.assertFalse(ioConfig.isAppendToExisting());
        Assert.assertEquals(expectedDropExisting, ioConfig.isDropExisting());
        final InputSource inputSource = ioConfig.getInputSource();
        Assert.assertTrue(inputSource instanceof DruidInputSource);
        final DruidInputSource druidInputSource = (DruidInputSource) inputSource;
        Assert.assertEquals(DATA_SOURCE, druidInputSource.getDataSource());
        Assert.assertEquals(expectedSegmentIntervals.get(i), druidInputSource.getInterval());
        Assert.assertNull(druidInputSource.getDimFilter());
        // assert tuningConfig
        Assert.assertEquals(expectedTuningConfig, ingestionSchema.getTuningConfig());
    }
}
Also used: DataSchema (org.apache.druid.segment.indexing.DataSchema), UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec), DruidInputSource (org.apache.druid.indexing.input.DruidInputSource), InputSource (org.apache.druid.data.input.InputSource), ParallelIndexIOConfig (org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIOConfig), TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec), ParallelIndexIngestionSpec (org.apache.druid.indexing.common.task.batch.parallel.ParallelIndexIngestionSpec), DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)
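One detail worth noting in the assertion above is the four-argument UniformGranularitySpec constructor: (segmentGranularity, queryGranularity, rollup, inputIntervals), with rollup passed as false. A small self-contained sketch of that constructor follows, assuming the same Druid version as the test; the granularities and interval are illustrative values.

import java.util.Collections;
import org.apache.druid.java.util.common.Intervals;
import org.apache.druid.java.util.common.granularity.Granularities;
import org.apache.druid.segment.indexing.granularity.UniformGranularitySpec;

public class GranularitySpecSketch {
    public static void main(String[] args) {
        // Four-argument form: segment granularity, query granularity, rollup flag,
        // and the input intervals the spec covers.
        UniformGranularitySpec granularitySpec = new UniformGranularitySpec(
            Granularities.DAY,
            Granularities.MINUTE,
            false,
            Collections.singletonList(Intervals.of("2017-12/P1M"))
        );
        System.out.println(granularitySpec.inputIntervals());
    }
}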

Aggregations

DataSchema (org.apache.druid.segment.indexing.DataSchema): 80
UniformGranularitySpec (org.apache.druid.segment.indexing.granularity.UniformGranularitySpec): 49
TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec): 45
Test (org.junit.Test): 44
DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec): 32
AggregatorFactory (org.apache.druid.query.aggregation.AggregatorFactory): 25
LongSumAggregatorFactory (org.apache.druid.query.aggregation.LongSumAggregatorFactory): 22
GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec): 19
InputSource (org.apache.druid.data.input.InputSource): 17
InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest): 17
File (java.io.File): 16
Map (java.util.Map): 15
InputFormat (org.apache.druid.data.input.InputFormat): 15
CountAggregatorFactory (org.apache.druid.query.aggregation.CountAggregatorFactory): 15
SamplerResponse (org.apache.druid.client.indexing.SamplerResponse): 14
SamplerResponseRow (org.apache.druid.client.indexing.SamplerResponse.SamplerResponseRow): 13
CsvInputFormat (org.apache.druid.data.input.impl.CsvInputFormat): 13
Interval (org.joda.time.Interval): 13
ArrayList (java.util.ArrayList): 12
JsonInputFormat (org.apache.druid.data.input.impl.JsonInputFormat): 12