Search in sources :

Example 6 with WindowedStorageAdapter

use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.

the class IndexTaskTest method testNumShardsAndPartitionDimensionsProvided.

@Test
public void testNumShardsAndPartitionDimensionsProvided() throws Exception {
    final File tmpDir = temporaryFolder.newFolder();
    final File tmpFile = File.createTempFile("druid", "index", tmpDir);
    try (BufferedWriter writer = Files.newWriter(tmpFile, StandardCharsets.UTF_8)) {
        writer.write("2014-01-01T00:00:10Z,a,1\n");
        writer.write("2014-01-01T01:00:20Z,b,1\n");
        writer.write("2014-01-01T02:00:30Z,c,1\n");
    }
    final IndexTask indexTask = new IndexTask(null, null, createDefaultIngestionSpec(jsonMapper, tmpDir, null, null, createTuningConfigWithPartitionsSpec(new HashedPartitionsSpec(null, 2, ImmutableList.of("dim")), true), false, false), null);
    final List<DataSegment> segments = runTask(indexTask).rhs;
    Assert.assertEquals(2, segments.size());
    for (DataSegment segment : segments) {
        Assert.assertEquals(DATASOURCE, segment.getDataSource());
        Assert.assertEquals(Intervals.of("2014/P1D"), segment.getInterval());
        Assert.assertEquals(HashBasedNumberedShardSpec.class, segment.getShardSpec().getClass());
        final HashBasedNumberedShardSpec hashBasedNumberedShardSpec = (HashBasedNumberedShardSpec) segment.getShardSpec();
        Assert.assertEquals(HashPartitionFunction.MURMUR3_32_ABS, hashBasedNumberedShardSpec.getPartitionFunction());
        final File segmentFile = segmentCacheManager.getSegmentFiles(segment);
        final WindowedStorageAdapter adapter = new WindowedStorageAdapter(new QueryableIndexStorageAdapter(indexIO.loadIndex(segmentFile)), segment.getInterval());
        final Sequence<Cursor> cursorSequence = adapter.getAdapter().makeCursors(null, segment.getInterval(), VirtualColumns.EMPTY, Granularities.ALL, false, null);
        final List<Integer> hashes = cursorSequence.map(cursor -> {
            final DimensionSelector selector = cursor.getColumnSelectorFactory().makeDimensionSelector(new DefaultDimensionSpec("dim", "dim"));
            final int hash = HashPartitionFunction.MURMUR3_32_ABS.hash(HashBasedNumberedShardSpec.serializeGroupKey(jsonMapper, Collections.singletonList(selector.getObject())), hashBasedNumberedShardSpec.getNumBuckets());
            cursor.advance();
            return hash;
        }).toList();
        Assert.assertTrue(hashes.stream().allMatch(h -> h.intValue() == hashes.get(0)));
    }
}
Also used : HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) TaskReport(org.apache.druid.indexing.common.TaskReport) TaskToolbox(org.apache.druid.indexing.common.TaskToolbox) Arrays(java.util.Arrays) IndexSpec(org.apache.druid.segment.IndexSpec) Pair(org.apache.druid.java.util.common.Pair) Map(java.util.Map) ExpressionTransform(org.apache.druid.segment.transform.ExpressionTransform) AppenderatorsManager(org.apache.druid.segment.realtime.appenderator.AppenderatorsManager) JsonInputFormat(org.apache.druid.data.input.impl.JsonInputFormat) IAE(org.apache.druid.java.util.common.IAE) InputFormat(org.apache.druid.data.input.InputFormat) IngestionStatsAndErrorsTaskReportData(org.apache.druid.indexing.common.IngestionStatsAndErrorsTaskReportData) Set(java.util.Set) NoopSegmentHandoffNotifierFactory(org.apache.druid.segment.realtime.plumber.NoopSegmentHandoffNotifierFactory) EqualsVerifier(nl.jqno.equalsverifier.EqualsVerifier) StringDimensionSchema(org.apache.druid.data.input.impl.StringDimensionSchema) StandardCharsets(java.nio.charset.StandardCharsets) TaskState(org.apache.druid.indexer.TaskState) CountDownLatch(java.util.concurrent.CountDownLatch) PartitionIds(org.apache.druid.timeline.partition.PartitionIds) IndexTuningConfig(org.apache.druid.indexing.common.task.IndexTask.IndexTuningConfig) RowIngestionMetersFactory(org.apache.druid.segment.incremental.RowIngestionMetersFactory) SegmentLocalCacheManager(org.apache.druid.segment.loading.SegmentLocalCacheManager) SegmentId(org.apache.druid.timeline.SegmentId) TransformSpec(org.apache.druid.segment.transform.TransformSpec) Granularity(org.apache.druid.java.util.common.granularity.Granularity) SegmentLoaderConfig(org.apache.druid.segment.loading.SegmentLoaderConfig) HashBasedNumberedShardSpec(org.apache.druid.timeline.partition.HashBasedNumberedShardSpec) RunWith(org.junit.runner.RunWith) TaskStatus(org.apache.druid.indexer.TaskStatus) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Interval(org.joda.time.Interval) StringInputRowParser(org.apache.druid.data.input.impl.StringInputRowParser) PartitionsSpec(org.apache.druid.indexer.partitions.PartitionsSpec) Nullable(javax.annotation.Nullable) HashPartitionFunction(org.apache.druid.timeline.partition.HashPartitionFunction) Before(org.junit.Before) BufferedWriter(java.io.BufferedWriter) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) Test(org.junit.Test) IOException(java.io.IOException) EasyMock(org.easymock.EasyMock) File(java.io.File) Preconditions(com.google.common.base.Preconditions) Assert(org.junit.Assert) DataSchema(org.apache.druid.segment.indexing.DataSchema) CoreMatchers(org.hamcrest.CoreMatchers) ArbitraryGranularitySpec(org.apache.druid.segment.indexing.granularity.ArbitraryGranularitySpec) IndexIOConfig(org.apache.druid.indexing.common.task.IndexTask.IndexIOConfig) LocalInputSource(org.apache.druid.data.input.impl.LocalInputSource) LongDimensionSchema(org.apache.druid.data.input.impl.LongDimensionSchema) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) DefaultDimensionSpec(org.apache.druid.query.dimension.DefaultDimensionSpec) CSVParseSpec(org.apache.druid.data.input.impl.CSVParseSpec) LongSumAggregatorFactory(org.apache.druid.query.aggregation.LongSumAggregatorFactory) SelectorDimFilter(org.apache.druid.query.filter.SelectorDimFilter) Event(org.apache.druid.java.util.emitter.core.Event) DynamicPartitionsSpec(org.apache.druid.indexer.partitions.DynamicPartitionsSpec) TypeReference(com.fasterxml.jackson.core.type.TypeReference) Parameterized(org.junit.runners.Parameterized) ParseSpec(org.apache.druid.data.input.impl.ParseSpec) Sequence(org.apache.druid.java.util.common.guava.Sequence) ShardSpec(org.apache.druid.timeline.partition.ShardSpec) LocalFirehoseFactory(org.apache.druid.segment.realtime.firehose.LocalFirehoseFactory) ImmutableMap(com.google.common.collect.ImmutableMap) AggregatorFactory(org.apache.druid.query.aggregation.AggregatorFactory) NumberedShardSpec(org.apache.druid.timeline.partition.NumberedShardSpec) StringUtils(org.apache.druid.java.util.common.StringUtils) CsvInputFormat(org.apache.druid.data.input.impl.CsvInputFormat) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) LockGranularity(org.apache.druid.indexing.common.LockGranularity) ExprMacroTable(org.apache.druid.math.expr.ExprMacroTable) IndexIngestionSpec(org.apache.druid.indexing.common.task.IndexTask.IndexIngestionSpec) List(java.util.List) UniformGranularitySpec(org.apache.druid.segment.indexing.granularity.UniformGranularitySpec) ServiceEmitter(org.apache.druid.java.util.emitter.service.ServiceEmitter) DataSegment(org.apache.druid.timeline.DataSegment) SegmentHandoffNotifierFactory(org.apache.druid.segment.handoff.SegmentHandoffNotifierFactory) SegmentAllocateAction(org.apache.druid.indexing.common.actions.SegmentAllocateAction) Intervals(org.apache.druid.java.util.common.Intervals) HashMap(java.util.HashMap) RowIngestionMeters(org.apache.druid.segment.incremental.RowIngestionMeters) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) FloatDimensionSchema(org.apache.druid.data.input.impl.FloatDimensionSchema) Files(com.google.common.io.Files) NumberedOverwriteShardSpec(org.apache.druid.timeline.partition.NumberedOverwriteShardSpec) DimensionSelector(org.apache.druid.segment.DimensionSelector) ExpectedException(org.junit.rules.ExpectedException) SegmentHandoffNotifier(org.apache.druid.segment.handoff.SegmentHandoffNotifier) NoopServiceEmitter(org.apache.druid.server.metrics.NoopServiceEmitter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) VirtualColumns(org.apache.druid.segment.VirtualColumns) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JSONParseSpec(org.apache.druid.data.input.impl.JSONParseSpec) StorageLocationConfig(org.apache.druid.segment.loading.StorageLocationConfig) Granularities(org.apache.druid.java.util.common.granularity.Granularities) TimeUnit(java.util.concurrent.TimeUnit) Rule(org.junit.Rule) SingleDimensionPartitionsSpec(org.apache.druid.indexer.partitions.SingleDimensionPartitionsSpec) Cursor(org.apache.druid.segment.Cursor) SegmentCacheManager(org.apache.druid.segment.loading.SegmentCacheManager) IndexIO(org.apache.druid.segment.IndexIO) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) HashedPartitionsSpec(org.apache.druid.indexer.partitions.HashedPartitionsSpec) DimensionSelector(org.apache.druid.segment.DimensionSelector) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) Cursor(org.apache.druid.segment.Cursor) DataSegment(org.apache.druid.timeline.DataSegment) DefaultDimensionSpec(org.apache.druid.query.dimension.DefaultDimensionSpec) BufferedWriter(java.io.BufferedWriter) File(java.io.File) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) Test(org.junit.Test)

Example 7 with WindowedStorageAdapter

use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.

the class BatchDeltaIngestionTest method testIngestion.

private void testIngestion(HadoopDruidIndexerConfig config, List<ImmutableMap<String, Object>> expectedRowsGenerated, WindowedDataSegment windowedDataSegment, List<String> expectedDimensions, List<String> expectedMetrics) throws Exception {
    IndexGeneratorJob job = new IndexGeneratorJob(config);
    Assert.assertTrue(JobHelper.runJobs(ImmutableList.of(job)));
    List<DataSegmentAndIndexZipFilePath> dataSegmentAndIndexZipFilePaths = IndexGeneratorJob.getPublishedSegmentAndIndexZipFilePaths(config);
    JobHelper.renameIndexFilesForSegments(config.getSchema(), dataSegmentAndIndexZipFilePaths);
    JobHelper.maybeDeleteIntermediatePath(true, config.getSchema());
    File workingPath = new File(config.makeIntermediatePath().toUri().getPath());
    Assert.assertFalse(workingPath.exists());
    File segmentFolder = new File(StringUtils.format("%s/%s/%s_%s/%s/0", config.getSchema().getIOConfig().getSegmentOutputPath(), config.getSchema().getDataSchema().getDataSource(), INTERVAL_FULL.getStart().toString(), INTERVAL_FULL.getEnd().toString(), config.getSchema().getTuningConfig().getVersion()));
    Assert.assertTrue(segmentFolder.exists());
    File indexZip = new File(segmentFolder, "index.zip");
    Assert.assertTrue(indexZip.exists());
    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
    new LocalDataSegmentPuller().getSegmentFiles(indexZip, tmpUnzippedSegmentDir);
    QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
    StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    Firehose firehose = new IngestSegmentFirehose(ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())), TransformSpec.NONE, expectedDimensions, expectedMetrics, null);
    List<InputRow> rows = new ArrayList<>();
    while (firehose.hasMore()) {
        rows.add(firehose.nextRow());
    }
    verifyRows(expectedRowsGenerated, rows, expectedDimensions, expectedMetrics);
}
Also used : IngestSegmentFirehose(org.apache.druid.segment.realtime.firehose.IngestSegmentFirehose) IngestSegmentFirehose(org.apache.druid.segment.realtime.firehose.IngestSegmentFirehose) Firehose(org.apache.druid.data.input.Firehose) ArrayList(java.util.ArrayList) StorageAdapter(org.apache.druid.segment.StorageAdapter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) LocalDataSegmentPuller(org.apache.druid.segment.loading.LocalDataSegmentPuller) QueryableIndex(org.apache.druid.segment.QueryableIndex) InputRow(org.apache.druid.data.input.InputRow) File(java.io.File) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter)

Example 8 with WindowedStorageAdapter

use of org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter in project druid by druid-io.

the class DatasourceRecordReader method initialize.

@Override
public void initialize(InputSplit split, final TaskAttemptContext context) throws IOException {
    List<WindowedDataSegment> segments = ((DatasourceInputSplit) split).getSegments();
    String dataSource = Iterators.getOnlyElement(segments.stream().map(s -> s.getSegment().getDataSource()).distinct().iterator());
    spec = DatasourceInputFormat.getIngestionSpec(context.getConfiguration(), dataSource);
    logger.info("load schema [%s]", spec);
    List<WindowedStorageAdapter> adapters = Lists.transform(segments, new Function<WindowedDataSegment, WindowedStorageAdapter>() {

        @Override
        public WindowedStorageAdapter apply(WindowedDataSegment segment) {
            try {
                logger.info("Getting storage path for segment [%s]", segment.getSegment().getId());
                Path path = new Path(JobHelper.getURIFromSegment(segment.getSegment()));
                logger.info("Fetch segment files from [%s]", path);
                File dir = FileUtils.createTempDir();
                tmpSegmentDirs.add(dir);
                logger.info("Locally storing fetched segment at [%s]", dir);
                JobHelper.unzipNoGuava(path, context.getConfiguration(), dir, context, null);
                logger.info("finished fetching segment files");
                QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
                indexes.add(index);
                numRows += index.getNumRows();
                return new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), segment.getInterval());
            } catch (IOException ex) {
                throw new RuntimeException(ex);
            }
        }
    });
    firehose = new IngestSegmentFirehose(adapters, spec.getTransformSpec(), spec.getDimensions(), spec.getMetrics(), spec.getFilter());
}
Also used : Logger(org.apache.druid.java.util.common.logger.Logger) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Function(com.google.common.base.Function) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) NullWritable(org.apache.hadoop.io.NullWritable) HadoopDruidIndexerConfig(org.apache.druid.indexer.HadoopDruidIndexerConfig) InputSplit(org.apache.hadoop.mapreduce.InputSplit) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) QueryableIndex(org.apache.druid.segment.QueryableIndex) IOException(java.io.IOException) Iterators(com.google.common.collect.Iterators) File(java.io.File) RecordReader(org.apache.hadoop.mapreduce.RecordReader) ArrayList(java.util.ArrayList) Row(org.apache.druid.data.input.Row) InputRow(org.apache.druid.data.input.InputRow) List(java.util.List) Lists(com.google.common.collect.Lists) IngestSegmentFirehose(org.apache.druid.segment.realtime.firehose.IngestSegmentFirehose) Closeables(com.google.common.io.Closeables) Path(org.apache.hadoop.fs.Path) FileUtils(org.apache.druid.java.util.common.FileUtils) JobHelper(org.apache.druid.indexer.JobHelper) Path(org.apache.hadoop.fs.Path) IngestSegmentFirehose(org.apache.druid.segment.realtime.firehose.IngestSegmentFirehose) QueryableIndexStorageAdapter(org.apache.druid.segment.QueryableIndexStorageAdapter) IOException(java.io.IOException) QueryableIndex(org.apache.druid.segment.QueryableIndex) WindowedStorageAdapter(org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter) File(java.io.File)

Aggregations

File (java.io.File)8 QueryableIndexStorageAdapter (org.apache.druid.segment.QueryableIndexStorageAdapter)8 WindowedStorageAdapter (org.apache.druid.segment.realtime.firehose.WindowedStorageAdapter)8 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 List (java.util.List)6 DataSegment (org.apache.druid.timeline.DataSegment)5 ImmutableList (com.google.common.collect.ImmutableList)4 Map (java.util.Map)4 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)4 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)4 Granularities (org.apache.druid.java.util.common.granularity.Granularities)4 Cursor (org.apache.druid.segment.Cursor)4 DimensionSelector (org.apache.druid.segment.DimensionSelector)4 SegmentCacheManager (org.apache.druid.segment.loading.SegmentCacheManager)4 TypeReference (com.fasterxml.jackson.core.type.TypeReference)3 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)3 Preconditions (com.google.common.base.Preconditions)3 ImmutableMap (com.google.common.collect.ImmutableMap)3 Sets (com.google.common.collect.Sets)3