
Example 1 with MapInputRowParser

Use of io.druid.data.input.impl.MapInputRowParser in project hive by apache.

From class TestDruidRecordWriter, method testWrite.

// This test needs this patch: https://github.com/druid-io/druid/pull/3483
@Ignore
@Test
public void testWrite() throws IOException, SegmentLoadingException {
    final String dataSourceName = "testDataSource";
    final File segmentOutputDir = temporaryFolder.newFolder();
    final File workingDir = temporaryFolder.newFolder();
    Configuration config = new Configuration();
    final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(ImmutableList.<DimensionSchema>of(new StringDimensionSchema("host")), null, null)));
    final Map<String, Object> parserMap = objectMapper.convertValue(inputRowParser, Map.class);
    DataSchema dataSchema = new DataSchema(dataSourceName, parserMap, new AggregatorFactory[] { new LongSumAggregatorFactory("visited_sum", "visited_sum"), new HyperUniquesAggregatorFactory("unique_hosts", "unique_hosts") }, new UniformGranularitySpec(Granularity.DAY, QueryGranularities.NONE, ImmutableList.of(INTERVAL_FULL)), objectMapper);
    RealtimeTuningConfig tuningConfig = RealtimeTuningConfig.makeDefaultTuningConfig(temporaryFolder.newFolder());
    LocalFileSystem localFileSystem = FileSystem.getLocal(config);
    DataSegmentPusher dataSegmentPusher = new LocalDataSegmentPusher(new LocalDataSegmentPusherConfig() {

        @Override
        public File getStorageDirectory() {
            return segmentOutputDir;
        }
    }, objectMapper);
    Path segmentDescriptorPath = new Path(workingDir.getAbsolutePath(), DruidStorageHandler.SEGMENTS_DESCRIPTOR_DIR_NAME);
    druidRecordWriter = new DruidRecordWriter(dataSchema, tuningConfig, dataSegmentPusher, 20, segmentDescriptorPath, localFileSystem);
    List<DruidWritable> druidWritables = Lists.transform(expectedRows, new Function<ImmutableMap<String, Object>, DruidWritable>() {

        @Nullable
        @Override
        public DruidWritable apply(@Nullable ImmutableMap<String, Object> input) {
            return new DruidWritable(ImmutableMap.<String, Object>builder().putAll(input).put(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME, Granularity.DAY.truncate(new DateTime((long) input.get(DruidTable.DEFAULT_TIMESTAMP_COLUMN))).getMillis()).build());
        }
    });
    for (DruidWritable druidWritable : druidWritables) {
        druidRecordWriter.write(druidWritable);
    }
    druidRecordWriter.close(false);
    List<DataSegment> dataSegmentList = DruidStorageHandlerUtils.getPublishedSegments(segmentDescriptorPath, config);
    Assert.assertEquals(1, dataSegmentList.size());
    File tmpUnzippedSegmentDir = temporaryFolder.newFolder();
    new LocalDataSegmentPuller().getSegmentFiles(dataSegmentList.get(0), tmpUnzippedSegmentDir);
    final QueryableIndex queryableIndex = DruidStorageHandlerUtils.INDEX_IO.loadIndex(tmpUnzippedSegmentDir);
    QueryableIndexStorageAdapter adapter = new QueryableIndexStorageAdapter(queryableIndex);
    Firehose firehose = new IngestSegmentFirehose(ImmutableList.of(new WindowedStorageAdapter(adapter, adapter.getInterval())), ImmutableList.of("host"), ImmutableList.of("visited_sum", "unique_hosts"), null, QueryGranularities.NONE);
    List<InputRow> rows = Lists.newArrayList();
    while (firehose.hasMore()) {
        rows.add(firehose.nextRow());
    }
    verifyRows(expectedRows, rows);
}
Also used: com.google.common.collect.ImmutableMap, io.druid.data.input.Firehose, io.druid.data.input.InputRow,
io.druid.data.input.impl.DimensionsSpec, io.druid.data.input.impl.InputRowParser, io.druid.data.input.impl.MapInputRowParser,
io.druid.data.input.impl.StringDimensionSchema, io.druid.data.input.impl.TimeAndDimsParseSpec, io.druid.data.input.impl.TimestampSpec,
io.druid.query.aggregation.LongSumAggregatorFactory, io.druid.query.aggregation.hyperloglog.HyperUniquesAggregatorFactory,
io.druid.segment.QueryableIndex, io.druid.segment.QueryableIndexStorageAdapter, io.druid.segment.indexing.DataSchema,
io.druid.segment.indexing.RealtimeTuningConfig, io.druid.segment.indexing.granularity.UniformGranularitySpec,
io.druid.segment.loading.DataSegmentPusher, io.druid.segment.loading.LocalDataSegmentPuller, io.druid.segment.loading.LocalDataSegmentPusher,
io.druid.segment.loading.LocalDataSegmentPusherConfig, io.druid.segment.realtime.firehose.IngestSegmentFirehose,
io.druid.segment.realtime.firehose.WindowedStorageAdapter, io.druid.timeline.DataSegment, java.io.File, javax.annotation.Nullable,
org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.LocalFileSystem, org.apache.hadoop.fs.Path,
org.apache.hadoop.hive.druid.io.DruidRecordWriter, org.apache.hadoop.hive.druid.serde.DruidWritable, org.joda.time.DateTime,
org.junit.Ignore, org.junit.Test
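For orientation, here is a minimal standalone sketch (not taken from the test above) of what the MapInputRowParser built with this TimeAndDimsParseSpec does: it turns a plain Map into a Druid InputRow. The column names and values are illustrative; "__time" stands in for DruidTable.DEFAULT_TIMESTAMP_COLUMN, and the classes are the same ones listed under "Also used" above.

MapInputRowParser parser = new MapInputRowParser(
        new TimeAndDimsParseSpec(
                new TimestampSpec("__time", "auto", null),
                new DimensionsSpec(ImmutableList.<DimensionSchema>of(new StringDimensionSchema("host")), null, null)));
// Parse one record; the "auto" format accepts an ISO timestamp string or epoch millis.
InputRow row = parser.parse(ImmutableMap.<String, Object>of(
        "__time", "2014-10-22T00:00:00Z",
        "host", "a.example.com",
        "visited_sum", 190L));
// row.getTimestamp() reflects "__time"; row.getDimension("host") returns ["a.example.com"],
// and undeclared keys such as "visited_sum" stay reachable through row.getRaw("visited_sum").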

Example 2 with MapInputRowParser

Use of io.druid.data.input.impl.MapInputRowParser in project hive by apache.

From class DruidOutputFormat, method getHiveRecordWriter.

@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
    final String segmentGranularity = tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) : HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_INDEXING_GRANULARITY);
    final String dataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
    final String segmentDirectory = tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) : HiveConf.getVar(jc, HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
    final HdfsDataSegmentPusherConfig hdfsDataSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
    hdfsDataSegmentPusherConfig.setStorageDirectory(segmentDirectory);
    final DataSegmentPusher hdfsDataSegmentPusher = new HdfsDataSegmentPusher(hdfsDataSegmentPusherConfig, jc, DruidStorageHandlerUtils.JSON_MAPPER);
    final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularity.valueOf(segmentGranularity), QueryGranularity.fromString(tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY) == null ? "NONE" : tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY)), null);
    final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
    final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
        throw new IllegalStateException(String.format("List of columns names [%s] or columns type [%s] is/are not present", columnNameProperty, columnTypeProperty));
    }
    ArrayList<String> columnNames = new ArrayList<String>();
    for (String name : columnNameProperty.split(",")) {
        columnNames.add(name);
    }
    if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        throw new IllegalStateException("Timestamp column (' " + DruidTable.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + tableProperties.getProperty(serdeConstants.LIST_COLUMNS));
    }
    ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    // By default, all columns that are not metrics or the timestamp column are treated as dimensions
    final List<DimensionSchema> dimensions = new ArrayList<>();
    ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = ImmutableList.builder();
    for (int i = 0; i < columnTypes.size(); i++) {
        PrimitiveTypeInfo f = (PrimitiveTypeInfo) columnTypes.get(i);
        AggregatorFactory af;
        switch(f.getPrimitiveCategory()) {
            case BYTE:
            case SHORT:
            case INT:
            case LONG:
                af = new LongSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
                break;
            case FLOAT:
            case DOUBLE:
            case DECIMAL:
                af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
                break;
            case TIMESTAMP:
                String tColumnName = columnNames.get(i);
                if (!tColumnName.equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN) && !tColumnName.equals(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME)) {
                    throw new IOException("Dimension " + tColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
                }
                continue;
            default:
                // Dimension
                String dColumnName = columnNames.get(i);
                if (PrimitiveObjectInspectorUtils.getPrimitiveGrouping(f.getPrimitiveCategory()) != PrimitiveGrouping.STRING_GROUP) {
                    throw new IOException("Dimension " + dColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
                }
                dimensions.add(new StringDimensionSchema(dColumnName));
                continue;
        }
        aggregatorFactoryBuilder.add(af);
    }
    List<AggregatorFactory> aggregatorFactories = aggregatorFactoryBuilder.build();
    final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(dimensions, Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME), null)));
    Map<String, Object> inputParser = DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, Map.class);
    final DataSchema dataSchema = new DataSchema(Preconditions.checkNotNull(dataSource, "Data source name is null"), inputParser, aggregatorFactories.toArray(new AggregatorFactory[aggregatorFactories.size()]), granularitySpec, DruidStorageHandlerUtils.JSON_MAPPER);
    final String workingPath = jc.get(Constants.DRUID_JOB_WORKING_DIRECTORY);
    final String version = jc.get(Constants.DRUID_SEGMENT_VERSION);
    Integer maxPartitionSize = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);
    String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
    if (Strings.isNullOrEmpty(basePersistDirectory)) {
        basePersistDirectory = System.getProperty("java.io.tmpdir");
    }
    Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
    RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null, new File(basePersistDirectory, dataSource), new CustomVersioningPolicy(version), null, null, null, null, true, 0, 0, true, null);
    LOG.debug(String.format("running with Data schema [%s] ", dataSchema));
    return new DruidRecordWriter(dataSchema, realtimeTuningConfig, hdfsDataSegmentPusher, maxPartitionSize, new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
Also used: com.google.common.collect.ImmutableList, io.druid.data.input.impl.DimensionSchema, io.druid.data.input.impl.DimensionsSpec,
io.druid.data.input.impl.InputRowParser, io.druid.data.input.impl.MapInputRowParser, io.druid.data.input.impl.StringDimensionSchema,
io.druid.data.input.impl.TimeAndDimsParseSpec, io.druid.data.input.impl.TimestampSpec, io.druid.query.aggregation.AggregatorFactory,
io.druid.query.aggregation.DoubleSumAggregatorFactory, io.druid.query.aggregation.LongSumAggregatorFactory,
io.druid.segment.indexing.DataSchema, io.druid.segment.indexing.RealtimeTuningConfig,
io.druid.segment.indexing.granularity.GranularitySpec, io.druid.segment.indexing.granularity.UniformGranularitySpec,
io.druid.segment.loading.DataSegmentPusher, io.druid.segment.realtime.plumber.CustomVersioningPolicy,
io.druid.storage.hdfs.HdfsDataSegmentPusher, io.druid.storage.hdfs.HdfsDataSegmentPusherConfig, java.io.File, java.io.IOException,
java.util.ArrayList, org.apache.hadoop.fs.Path, org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo,
org.apache.hadoop.hive.serde2.typeinfo.TypeInfo
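To make the inputs concrete, the sketch below builds a hypothetical set of table properties of the kind this method reads. The literal values ("wikipedia", "DAY", the column lists) are invented for illustration, "__time" is assumed to match DruidTable.DEFAULT_TIMESTAMP_COLUMN, and the keys are the same Constants and serdeConstants constants used above; anything not set here falls back to the HiveConf defaults, as in the code.

Properties tableProperties = new Properties();   // java.util.Properties
tableProperties.setProperty(Constants.DRUID_DATA_SOURCE, "wikipedia");
tableProperties.setProperty(Constants.DRUID_SEGMENT_GRANULARITY, "DAY");
tableProperties.setProperty(Constants.DRUID_QUERY_GRANULARITY, "NONE");
// Hive's comma-separated column list and colon-separated type list; the timestamp column
// must appear in the column list or the method throws IllegalStateException.
tableProperties.setProperty(serdeConstants.LIST_COLUMNS, "__time,host,visited_sum");
tableProperties.setProperty(serdeConstants.LIST_COLUMN_TYPES, "timestamp:string:bigint");
// With these properties the loop above produces one dimension (StringDimensionSchema "host")
// and one metric (LongSumAggregatorFactory "visited_sum").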

Example 3 with MapInputRowParser

Use of io.druid.data.input.impl.MapInputRowParser in project druid by druid-io.

From class DefaultOfflineAppenderatorFactoryTest, method testBuild.

@Test
public void testBuild() throws IOException, SegmentNotWritableException {
    Injector injector = Initialization.makeInjectorWithModules(GuiceInjectors.makeStartupInjector(), ImmutableList.<Module>of(new Module() {

        @Override
        public void configure(Binder binder) {
            binder.bindConstant().annotatedWith(Names.named("serviceName")).to("druid/tool");
            binder.bindConstant().annotatedWith(Names.named("servicePort")).to(9999);
            binder.bind(DruidProcessingConfig.class).toInstance(new DruidProcessingConfig() {

                @Override
                public String getFormatString() {
                    return "processing-%s";
                }

                @Override
                public int intermediateComputeSizeBytes() {
                    return 100 * 1024 * 1024;
                }

                @Override
                public int getNumThreads() {
                    return 1;
                }

                @Override
                public int columnCacheSizeBytes() {
                    return 25 * 1024 * 1024;
                }
            });
            binder.bind(ColumnConfig.class).to(DruidProcessingConfig.class);
        }
    }));
    ObjectMapper objectMapper = injector.getInstance(ObjectMapper.class);
    AppenderatorFactory defaultOfflineAppenderatorFactory = objectMapper.reader(AppenderatorFactory.class).readValue("{\"type\":\"offline\"}");
    final Map<String, Object> parserMap = objectMapper.convertValue(new MapInputRowParser(new JSONParseSpec(new TimestampSpec("ts", "auto", null), new DimensionsSpec(null, null, null), null, null)), Map.class);
    DataSchema schema = new DataSchema("dataSourceName", parserMap, new AggregatorFactory[] { new CountAggregatorFactory("count"), new LongSumAggregatorFactory("met", "met") }, new UniformGranularitySpec(Granularities.MINUTE, Granularities.NONE, null), objectMapper);
    RealtimeTuningConfig tuningConfig = new RealtimeTuningConfig(75000, null, null, temporaryFolder.newFolder(), null, null, null, null, null, null, 0, 0, null, null);
    try (Appenderator appenderator = defaultOfflineAppenderatorFactory.build(schema, tuningConfig, new FireDepartmentMetrics())) {
        Assert.assertEquals("dataSourceName", appenderator.getDataSource());
        Assert.assertEquals(null, appenderator.startJob());
        SegmentIdentifier identifier = new SegmentIdentifier("dataSourceName", new Interval("2000/2001"), "A", new LinearShardSpec(0));
        Assert.assertEquals(0, ((AppenderatorImpl) appenderator).getRowsInMemory());
        appenderator.add(identifier, AppenderatorTest.IR("2000", "bar", 1), Suppliers.ofInstance(Committers.nil()));
        Assert.assertEquals(1, ((AppenderatorImpl) appenderator).getRowsInMemory());
        appenderator.add(identifier, AppenderatorTest.IR("2000", "baz", 1), Suppliers.ofInstance(Committers.nil()));
        Assert.assertEquals(2, ((AppenderatorImpl) appenderator).getRowsInMemory());
        appenderator.close();
        Assert.assertEquals(0, ((AppenderatorImpl) appenderator).getRowsInMemory());
    }
}
Also used: com.fasterxml.jackson.databind.ObjectMapper, com.google.inject.Binder, com.google.inject.Injector, com.google.inject.Module,
io.druid.data.input.impl.DimensionsSpec, io.druid.data.input.impl.JSONParseSpec, io.druid.data.input.impl.MapInputRowParser,
io.druid.data.input.impl.TimestampSpec, io.druid.query.DruidProcessingConfig, io.druid.query.aggregation.CountAggregatorFactory,
io.druid.query.aggregation.LongSumAggregatorFactory, io.druid.segment.column.ColumnConfig, io.druid.segment.indexing.DataSchema,
io.druid.segment.indexing.RealtimeTuningConfig, io.druid.segment.indexing.granularity.UniformGranularitySpec,
io.druid.segment.realtime.FireDepartmentMetrics, io.druid.timeline.partition.LinearShardSpec, org.joda.time.Interval, org.junit.Test
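Worth noting from this test: the MapInputRowParser is serialized into parserMap and only rebuilt when something asks the DataSchema for it. A small hedged sketch of that round trip, reusing the schema built above (ImmutableMap from Guava is assumed to be on the classpath; the row values are invented):

// DataSchema deserializes parserMap back into a MapInputRowParser on demand, excluding
// the timestamp column and metric fields from auto-discovered dimensions.
InputRow row = schema.getParser().parse(
        ImmutableMap.<String, Object>of("ts", "2000-01-01T00:00:00Z", "dim", "bar", "met", 1));
// With the JSONParseSpec above declaring no dimensions, "dim" should come back as the
// only discovered dimension; "met" is consumed by the LongSumAggregatorFactory instead.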

Example 4 with MapInputRowParser

Use of io.druid.data.input.impl.MapInputRowParser in project druid by druid-io.

From class EventReceiverFirehoseTest, method setUp.

@Before
public void setUp() throws Exception {
    req = EasyMock.createMock(HttpServletRequest.class);
    eventReceiverFirehoseFactory = new EventReceiverFirehoseFactory(SERVICE_NAME, CAPACITY, null, new DefaultObjectMapper(), new DefaultObjectMapper(), register);
    firehose = (EventReceiverFirehoseFactory.EventReceiverFirehose) eventReceiverFirehoseFactory.connect(new MapInputRowParser(new JSONParseSpec(new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("d1")), null, null), null, null)));
}
Also used: io.druid.data.input.impl.DimensionsSpec, io.druid.data.input.impl.JSONParseSpec, io.druid.data.input.impl.MapInputRowParser,
io.druid.data.input.impl.TimestampSpec, io.druid.jackson.DefaultObjectMapper, javax.servlet.http.HttpServletRequest, org.junit.Before
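Every event posted to the connected firehose is parsed with the MapInputRowParser supplied here (timestamp column "timestamp", dimension "d1") before being handed out. A brief sketch, not part of the test, of the consumer side once an event has been posted:

if (firehose.hasMore()) {               // blocks until an event arrives or the firehose is closed
    InputRow row = firehose.nextRow();  // already parsed by the MapInputRowParser above
    // row.getDimension("d1") holds the "d1" value from the posted event
}
firehose.close();                       // Firehose extends Closeable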

Example 5 with MapInputRowParser

Use of io.druid.data.input.impl.MapInputRowParser in project druid by druid-io.

From class EventReceiverFirehoseTest, method testDuplicateRegistering.

@Test(expected = ISE.class)
public void testDuplicateRegistering() throws IOException {
    EventReceiverFirehoseFactory eventReceiverFirehoseFactory2 = new EventReceiverFirehoseFactory(SERVICE_NAME, CAPACITY, null, new DefaultObjectMapper(), new DefaultObjectMapper(), register);
    EventReceiverFirehoseFactory.EventReceiverFirehose firehose2 = (EventReceiverFirehoseFactory.EventReceiverFirehose) eventReceiverFirehoseFactory2.connect(new MapInputRowParser(new JSONParseSpec(new TimestampSpec("timestamp", "auto", null), new DimensionsSpec(DimensionsSpec.getDefaultSchemas(ImmutableList.of("d1")), null, null), null, null)));
}
Also used: io.druid.data.input.impl.DimensionsSpec, io.druid.data.input.impl.JSONParseSpec, io.druid.data.input.impl.MapInputRowParser,
io.druid.data.input.impl.TimestampSpec, io.druid.jackson.DefaultObjectMapper, org.junit.Test

Aggregations

DimensionsSpec (io.druid.data.input.impl.DimensionsSpec): 6
MapInputRowParser (io.druid.data.input.impl.MapInputRowParser): 6
TimestampSpec (io.druid.data.input.impl.TimestampSpec): 6
JSONParseSpec (io.druid.data.input.impl.JSONParseSpec): 4
LongSumAggregatorFactory (io.druid.query.aggregation.LongSumAggregatorFactory): 4
InputRowParser (io.druid.data.input.impl.InputRowParser): 3
DataSchema (io.druid.segment.indexing.DataSchema): 3
RealtimeTuningConfig (io.druid.segment.indexing.RealtimeTuningConfig): 3
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec): 3
DataSegmentPusher (io.druid.segment.loading.DataSegmentPusher): 3
File (java.io.File): 3
Test (org.junit.Test): 3
ImmutableList (com.google.common.collect.ImmutableList): 2
Binder (com.google.inject.Binder): 2
Module (com.google.inject.Module): 2
StringDimensionSchema (io.druid.data.input.impl.StringDimensionSchema): 2
TimeAndDimsParseSpec (io.druid.data.input.impl.TimeAndDimsParseSpec): 2
DefaultObjectMapper (io.druid.jackson.DefaultObjectMapper): 2
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory): 2
DoubleSumAggregatorFactory (io.druid.query.aggregation.DoubleSumAggregatorFactory): 2