
Example 16 with QueryableIndex

Use of io.druid.segment.QueryableIndex in project druid by druid-io.

From the class DatasourceRecordReader, method initialize().

@Override
public void initialize(InputSplit split, final TaskAttemptContext context) throws IOException, InterruptedException {
    spec = readAndVerifyDatasourceIngestionSpec(context.getConfiguration(), HadoopDruidIndexerConfig.JSON_MAPPER);
    List<WindowedDataSegment> segments = ((DatasourceInputSplit) split).getSegments();
    // Lists.transform returns a lazy view: each segment is fetched and loaded
    // only when the firehose first touches the corresponding adapter.
    List<WindowedStorageAdapter> adapters = Lists.transform(segments, new Function<WindowedDataSegment, WindowedStorageAdapter>() {

        @Override
        public WindowedStorageAdapter apply(WindowedDataSegment segment) {
            try {
                logger.info("Getting storage path for segment [%s]", segment.getSegment().getIdentifier());
                Path path = new Path(JobHelper.getURIFromSegment(segment.getSegment()));
                logger.info("Fetch segment files from [%s]", path);
                File dir = Files.createTempDir();
                tmpSegmentDirs.add(dir);
                logger.info("Locally storing fetched segment at [%s]", dir);
                JobHelper.unzipNoGuava(path, context.getConfiguration(), dir, context);
                logger.info("finished fetching segment files");
                QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
                indexes.add(index);
                numRows += index.getNumRows();
                return new WindowedStorageAdapter(new QueryableIndexStorageAdapter(index), segment.getInterval());
            } catch (IOException ex) {
                throw Throwables.propagate(ex);
            }
        }
    });
    firehose = new IngestSegmentFirehose(adapters, spec.getDimensions(), spec.getMetrics(), spec.getFilter(), spec.getGranularity());
}
Also used: Path (org.apache.hadoop.fs.Path), IngestSegmentFirehose (io.druid.segment.realtime.firehose.IngestSegmentFirehose), QueryableIndexStorageAdapter (io.druid.segment.QueryableIndexStorageAdapter), IOException (java.io.IOException), QueryableIndex (io.druid.segment.QueryableIndex), WindowedStorageAdapter (io.druid.segment.realtime.firehose.WindowedStorageAdapter), File (java.io.File)
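
Stripped of the Hadoop plumbing, the core pattern above is: fetch and unzip the segment, load the directory as a QueryableIndex, then wrap it in a windowed storage adapter scoped to the segment's interval. A minimal sketch, assuming an already-fetched segment directory dir and a query interval interval (both stand-ins, not names from the source):

// Sketch only: `dir` and `interval` are assumed inputs.
QueryableIndex index = HadoopDruidIndexerConfig.INDEX_IO.loadIndex(dir);
StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
WindowedStorageAdapter windowed = new WindowedStorageAdapter(adapter, interval);
int numRows = index.getNumRows(); // tracked for progress reporting in initialize() above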

Example 17 with QueryableIndex

Use of io.druid.segment.QueryableIndex in project druid by druid-io.

From the class BatchDeltaIngestionTest, method testIngestion().

private void testIngestion(HadoopDruidIndexerConfig config, List<ImmutableMap<String, Object>> expectedRowsGenerated, WindowedDataSegment windowedDataSegment) throws Exception {
    IndexGeneratorJob job = new IndexGeneratorJob(config);
    JobHelper.runJobs(ImmutableList.<Jobby>of(job), config);
    File segmentFolder = new File(
        String.format(
            "%s/%s/%s_%s/%s/0",
            config.getSchema().getIOConfig().getSegmentOutputPath(),
            config.getSchema().getDataSchema().getDataSource(),
            INTERVAL_FULL.getStart().toString(),
            INTERVAL_FULL.getEnd().toString(),
            config.getSchema().getTuningConfig().getVersion()
        )
    );
    Assert.assertTrue(segmentFolder.exists());
    File descriptor = new File(segmentFolder, "descriptor.json");
    File indexZip = new File(segmentFolder, "index.zip");
    Assert.assertTrue(descriptor.exists());
    Assert.assertTrue(indexZip.exists());
    DataSegment dataSegment = MAPPER.readValue(descriptor, DataSegment.class);
    Assert.assertEquals("website", dataSegment.getDataSource());
    Assert.assertEquals(config.getSchema().getTuningConfig().getVersion(), dataSegment.getVersion());
    Assert.assertEquals(INTERVAL_FULL, dataSegment.getInterval());
    Assert.assertEquals("local", dataSegment.getLoadSpec().get("type"));
    Assert.assertEquals(indexZip.getCanonicalPath(), dataSegment.getLoadSpec().get("path"));
    Assert.assertEquals("host", dataSegment.getDimensions().get(0));
    Assert.assertEquals("visited_sum", dataSegment.getMetrics().get(0));
    Assert.assertEquals("unique_hosts", dataSegment.getMetrics().get(1));
    Assert.assertEquals(Integer.valueOf(9), dataSegment.getBinaryVersion());
    HashBasedNumberedShardSpec spec = (HashBasedNumberedShardSpec) dataSegment.getShardSpec();
    Assert.assertEquals(0, spec.getPartitionNum());
    Assert.assertEquals(1, spec.getPartitions());
    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
    new LocalDataSegmentPuller().getSegmentFiles(dataSegment, tmpUnzippedSegmentDir);
    QueryableIndex index = INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
    StorageAdapter adapter = new QueryableIndexStorageAdapter(index);
    Firehose firehose = new IngestSegmentFirehose(
        ImmutableList.of(new WindowedStorageAdapter(adapter, windowedDataSegment.getInterval())),
        ImmutableList.of("host"),
        ImmutableList.of("visited_sum", "unique_hosts"),
        null,
        Granularities.NONE
    );
    List<InputRow> rows = Lists.newArrayList();
    while (firehose.hasMore()) {
        rows.add(firehose.nextRow());
    }
    verifyRows(expectedRowsGenerated, rows);
}
Also used: HashBasedNumberedShardSpec (io.druid.timeline.partition.HashBasedNumberedShardSpec), IngestSegmentFirehose (io.druid.segment.realtime.firehose.IngestSegmentFirehose), Firehose (io.druid.data.input.Firehose), QueryableIndexStorageAdapter (io.druid.segment.QueryableIndexStorageAdapter), StorageAdapter (io.druid.segment.StorageAdapter), WindowedStorageAdapter (io.druid.segment.realtime.firehose.WindowedStorageAdapter), DataSegment (io.druid.timeline.DataSegment), WindowedDataSegment (io.druid.indexer.hadoop.WindowedDataSegment), LocalDataSegmentPuller (io.druid.segment.loading.LocalDataSegmentPuller), QueryableIndex (io.druid.segment.QueryableIndex), InputRow (io.druid.data.input.InputRow), File (java.io.File)
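
The read-back at the end of the test is a reusable idiom: drain a Firehose into a list of rows. A small helper sketch (the name drain is hypothetical, not from the source):

// Hypothetical helper mirroring the while-loop in testIngestion above.
private static List<InputRow> drain(Firehose firehose) {
    final List<InputRow> rows = Lists.newArrayList();
    while (firehose.hasMore()) {
        rows.add(firehose.nextRow());
    }
    return rows;
}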

Example 18 with QueryableIndex

Use of io.druid.segment.QueryableIndex in project druid by druid-io.

From the class DruidSchemaTest, method setUp().

@Before
public void setUp() throws Exception {
    Calcites.setSystemProperties();
    final File tmpDir = temporaryFolder.newFolder();
    final QueryableIndex index1 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "1"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(
            new IncrementalIndexSchema.Builder()
                .withMetrics(new AggregatorFactory[] {
                    new CountAggregatorFactory("cnt"),
                    new DoubleSumAggregatorFactory("m1", "m1"),
                    new HyperUniquesAggregatorFactory("unique_dim1", "dim1")
                })
                .withRollup(false)
                .build()
        )
        .rows(ROWS1)
        .buildMMappedIndex();
    final QueryableIndex index2 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "2"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(
            new IncrementalIndexSchema.Builder()
                .withMetrics(new AggregatorFactory[] { new LongSumAggregatorFactory("m1", "m1") })
                .withRollup(false)
                .build()
        )
        .rows(ROWS2)
        .buildMMappedIndex();
    walker = new SpecificSegmentsQuerySegmentWalker(CalciteTests.queryRunnerFactoryConglomerate())
        .add(
            DataSegment.builder().dataSource(CalciteTests.DATASOURCE1).interval(new Interval("2000/P1Y"))
                .version("1").shardSpec(new LinearShardSpec(0)).build(),
            index1
        )
        .add(
            DataSegment.builder().dataSource(CalciteTests.DATASOURCE1).interval(new Interval("2001/P1Y"))
                .version("1").shardSpec(new LinearShardSpec(0)).build(),
            index2
        )
        .add(
            DataSegment.builder().dataSource(CalciteTests.DATASOURCE2).interval(index2.getDataInterval())
                .version("1").shardSpec(new LinearShardSpec(0)).build(),
            index2
        );
    schema = new DruidSchema(walker, new TestServerInventoryView(walker.getSegments()), PLANNER_CONFIG_DEFAULT);
    schema.start();
    schema.awaitInitialization();
}
Also used: DoubleSumAggregatorFactory (io.druid.query.aggregation.DoubleSumAggregatorFactory), TestServerInventoryView (io.druid.sql.calcite.util.TestServerInventoryView), LinearShardSpec (io.druid.timeline.partition.LinearShardSpec), IndexBuilder (io.druid.segment.IndexBuilder), LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory), AggregatorFactory (io.druid.query.aggregation.AggregatorFactory), CountAggregatorFactory (io.druid.query.aggregation.CountAggregatorFactory), HyperUniquesAggregatorFactory (io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory), SpecificSegmentsQuerySegmentWalker (io.druid.sql.calcite.util.SpecificSegmentsQuerySegmentWalker), QueryableIndex (io.druid.segment.QueryableIndex), File (java.io.File), Interval (org.joda.time.Interval), Before (org.junit.Before)
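
Both indexes above follow one IndexBuilder recipe: point at a temp dir, pick a merger, describe the incremental-index schema, feed rows, and build a memory-mapped index. The same chain in isolation, as a sketch (rows stands in for the test data; the "sketch" subdirectory name is arbitrary):

// Sketch of the IndexBuilder recipe used twice in setUp() above.
final QueryableIndex index = IndexBuilder.create()
    .tmpDir(new File(tmpDir, "sketch"))
    .indexMerger(TestHelper.getTestIndexMergerV9())
    .schema(
        new IncrementalIndexSchema.Builder()
            .withMetrics(new AggregatorFactory[] { new CountAggregatorFactory("cnt") })
            .withRollup(false)
            .build()
    )
    .rows(rows)
    .buildMMappedIndex();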

Example 19 with QueryableIndex

Use of io.druid.segment.QueryableIndex in project druid by druid-io.

From the class CalciteTests, method createMockWalker().

public static SpecificSegmentsQuerySegmentWalker createMockWalker(final File tmpDir) {
    final QueryableIndex index1 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "1"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(INDEX_SCHEMA)
        .rows(ROWS1)
        .buildMMappedIndex();
    final QueryableIndex index2 = IndexBuilder.create()
        .tmpDir(new File(tmpDir, "2"))
        .indexMerger(TestHelper.getTestIndexMergerV9())
        .schema(INDEX_SCHEMA)
        .rows(ROWS2)
        .buildMMappedIndex();
    return new SpecificSegmentsQuerySegmentWalker(queryRunnerFactoryConglomerate())
        .add(
            DataSegment.builder().dataSource(DATASOURCE1).interval(index1.getDataInterval())
                .version("1").shardSpec(new LinearShardSpec(0)).build(),
            index1
        )
        .add(
            DataSegment.builder().dataSource(DATASOURCE2).interval(index2.getDataInterval())
                .version("1").shardSpec(new LinearShardSpec(0)).build(),
            index2
        );
}
Also used: QueryableIndex (io.druid.segment.QueryableIndex), LinearShardSpec (io.druid.timeline.partition.LinearShardSpec), File (java.io.File)
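
A hedged caller sketch; the temp-directory setup is an assumption, not shown in the source:

// Assumed usage; Files.createTempDir() is the Guava helper used in Example 16.
File tmpDir = Files.createTempDir();
SpecificSegmentsQuerySegmentWalker walker = CalciteTests.createMockWalker(tmpDir);
// ... run Calcite test queries against walker ...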

Example 20 with QueryableIndex

Use of io.druid.segment.QueryableIndex in project druid by druid-io.

From the class SegmentAnalyzer, method analyze().

public Map<String, ColumnAnalysis> analyze(Segment segment) {
    Preconditions.checkNotNull(segment, "segment");
    // index is null for incremental-index-based segments, but storageAdapter is always available
    final QueryableIndex index = segment.asQueryableIndex();
    final StorageAdapter storageAdapter = segment.asStorageAdapter();
    // get length and column names from storageAdapter
    final int length = storageAdapter.getNumRows();
    final Set<String> columnNames = Sets.newHashSet();
    Iterables.addAll(columnNames, storageAdapter.getAvailableDimensions());
    Iterables.addAll(columnNames, storageAdapter.getAvailableMetrics());
    Map<String, ColumnAnalysis> columns = Maps.newTreeMap();
    for (String columnName : columnNames) {
        final Column column = index == null ? null : index.getColumn(columnName);
        final ColumnCapabilities capabilities = column != null ? column.getCapabilities() : storageAdapter.getColumnCapabilities(columnName);
        final ColumnAnalysis analysis;
        final ValueType type = capabilities.getType();
        switch(type) {
            case LONG:
                analysis = analyzeNumericColumn(capabilities, length, Longs.BYTES);
                break;
            case FLOAT:
                analysis = analyzeNumericColumn(capabilities, length, NUM_BYTES_IN_TEXT_FLOAT);
                break;
            case STRING:
                if (index != null) {
                    analysis = analyzeStringColumn(capabilities, column);
                } else {
                    analysis = analyzeStringColumn(capabilities, storageAdapter, columnName);
                }
                break;
            case COMPLEX:
                analysis = analyzeComplexColumn(capabilities, column, storageAdapter.getColumnTypeName(columnName));
                break;
            default:
                log.warn("Unknown column type[%s].", type);
                analysis = ColumnAnalysis.error(String.format("unknown_type_%s", type));
        }
        columns.put(columnName, analysis);
    }
    // Add time column too
    ColumnCapabilities timeCapabilities = storageAdapter.getColumnCapabilities(Column.TIME_COLUMN_NAME);
    if (timeCapabilities == null) {
        timeCapabilities = new ColumnCapabilitiesImpl().setType(ValueType.LONG).setHasMultipleValues(false);
    }
    columns.put(Column.TIME_COLUMN_NAME, analyzeNumericColumn(timeCapabilities, length, NUM_BYTES_IN_TIMESTAMP));
    return columns;
}
Also used: ComplexColumn (io.druid.segment.column.ComplexColumn), Column (io.druid.segment.column.Column), ValueType (io.druid.segment.column.ValueType), QueryableIndex (io.druid.segment.QueryableIndex), ColumnAnalysis (io.druid.query.metadata.metadata.ColumnAnalysis), StorageAdapter (io.druid.segment.StorageAdapter), ColumnCapabilities (io.druid.segment.column.ColumnCapabilities), ColumnCapabilitiesImpl (io.druid.segment.column.ColumnCapabilitiesImpl)
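
A hedged sketch of driving analyze() with a mapped segment. The QueryableIndexSegment (identifier, index) constructor and the pre-built analyzer and indexIO instances are assumptions based on the io.druid 0.10-era API, not confirmed by this snippet:

// Assumed usage; `indexIO` and `analyzer` are pre-built instances (assumption).
QueryableIndex index = indexIO.loadIndex(segmentDir);
Segment segment = new QueryableIndexSegment("example-segment", index);
Map<String, ColumnAnalysis> columns = analyzer.analyze(segment);
// `columns` maps each column name, plus the __time column added at the end, to its analysis.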

Aggregations

QueryableIndex (io.druid.segment.QueryableIndex): 35
File (java.io.File): 23
IncrementalIndex (io.druid.segment.incremental.IncrementalIndex): 16
OnheapIncrementalIndex (io.druid.segment.incremental.OnheapIncrementalIndex): 12
InputRow (io.druid.data.input.InputRow): 10
IndexSpec (io.druid.segment.IndexSpec): 10
QueryableIndexSegment (io.druid.segment.QueryableIndexSegment): 8
IOException (java.io.IOException): 8
BenchmarkDataGenerator (io.druid.benchmark.datagen.BenchmarkDataGenerator): 7
HyperUniquesSerde (io.druid.query.aggregation.hyperloglog.HyperUniquesSerde): 7
Setup (org.openjdk.jmh.annotations.Setup): 7
FireHydrant (io.druid.segment.realtime.FireHydrant): 6
DataSegment (io.druid.timeline.DataSegment): 6
QueryableIndexStorageAdapter (io.druid.segment.QueryableIndexStorageAdapter): 5
StorageAdapter (io.druid.segment.StorageAdapter): 5
DateTime (org.joda.time.DateTime): 5
Interval (org.joda.time.Interval): 5
IncrementalIndexSegment (io.druid.segment.IncrementalIndexSegment): 4
IndexSizeExceededException (io.druid.segment.incremental.IndexSizeExceededException): 4
ImmutableList (com.google.common.collect.ImmutableList): 3