Example 6 with DimensionSchema

Use of io.druid.data.input.impl.DimensionSchema in project hive by apache.

From class DruidOutputFormat, the method getHiveRecordWriter:

@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        Progressable progress) throws IOException {
    // Table properties win; otherwise fall back to the HiveConf defaults.
    final String segmentGranularity = tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) != null
            ? tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY)
            : HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_INDEXING_GRANULARITY);
    final String dataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
    final String segmentDirectory = tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) != null
            ? tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY)
            : HiveConf.getVar(jc, HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
    final HdfsDataSegmentPusherConfig hdfsDataSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
    hdfsDataSegmentPusherConfig.setStorageDirectory(segmentDirectory);
    final DataSegmentPusher hdfsDataSegmentPusher = new HdfsDataSegmentPusher(
            hdfsDataSegmentPusherConfig, jc, DruidStorageHandlerUtils.JSON_MAPPER);
    // Segment granularity sizes the segments; query granularity (default NONE) is the
    // finest rollup applied within a segment.
    final GranularitySpec granularitySpec = new UniformGranularitySpec(
            Granularity.valueOf(segmentGranularity),
            QueryGranularity.fromString(tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY) == null
                    ? "NONE"
                    : tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY)),
            null);
    final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
    final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
        throw new IllegalStateException(String.format(
                "List of column names [%s] or column types [%s] is missing",
                columnNameProperty, columnTypeProperty));
    }
    ArrayList<String> columnNames = new ArrayList<String>();
    for (String name : columnNameProperty.split(",")) {
        columnNames.add(name);
    }
    if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
        throw new IllegalStateException("Timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN
                + "') not specified in create table; list of columns is: "
                + tableProperties.getProperty(serdeConstants.LIST_COLUMNS));
    }
    ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    // By default, every column that is neither a metric nor the timestamp is treated as a dimension.
    final List<DimensionSchema> dimensions = new ArrayList<>();
    ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = ImmutableList.builder();
    for (int i = 0; i < columnTypes.size(); i++) {
        PrimitiveTypeInfo f = (PrimitiveTypeInfo) columnTypes.get(i);
        AggregatorFactory af;
        switch(f.getPrimitiveCategory()) {
            case BYTE:
            case SHORT:
            case INT:
            case LONG:
                af = new LongSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
                break;
            case FLOAT:
            case DOUBLE:
            case DECIMAL:
                af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
                break;
            case TIMESTAMP:
                // Only the Druid timestamp column and the granularity helper column may be TIMESTAMP.
                String tColumnName = columnNames.get(i);
                if (!tColumnName.equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN)
                        && !tColumnName.equals(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME)) {
                    throw new IOException("TIMESTAMP column " + tColumnName
                            + " is neither the Druid timestamp column ('" + DruidTable.DEFAULT_TIMESTAMP_COLUMN
                            + "') nor the granularity column ('" + Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME + "')");
                }
                continue;
            default:
                // Dimension
                String dColumnName = columnNames.get(i);
                if (PrimitiveObjectInspectorUtils.getPrimitiveGrouping(f.getPrimitiveCategory())
                        != PrimitiveGrouping.STRING_GROUP) {
                    throw new IOException("Dimension " + dColumnName + " does not have STRING type: "
                            + f.getPrimitiveCategory());
                }
                dimensions.add(new StringDimensionSchema(dColumnName));
                continue;
        }
        aggregatorFactoryBuilder.add(af);
    }
    List<AggregatorFactory> aggregatorFactories = aggregatorFactoryBuilder.build();
    final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(
            new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null),
            new DimensionsSpec(dimensions,
                    Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME), null)));
    Map<String, Object> inputParser =
            DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, Map.class);
    final DataSchema dataSchema = new DataSchema(
            Preconditions.checkNotNull(dataSource, "Data source name is null"),
            inputParser,
            aggregatorFactories.toArray(new AggregatorFactory[aggregatorFactories.size()]),
            granularitySpec,
            DruidStorageHandlerUtils.JSON_MAPPER);
    final String workingPath = jc.get(Constants.DRUID_JOB_WORKING_DIRECTORY);
    final String version = jc.get(Constants.DRUID_SEGMENT_VERSION);
    Integer maxPartitionSize = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);
    String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
    if (Strings.isNullOrEmpty(basePersistDirectory)) {
        basePersistDirectory = System.getProperty("java.io.tmpdir");
    }
    Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
    // The null arguments and zero thread priorities fall back to Druid's tuning defaults.
    RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null,
            new File(basePersistDirectory, dataSource), new CustomVersioningPolicy(version),
            null, null, null, null, true, 0, 0, true, null);
    LOG.debug(String.format("Running with data schema [%s]", dataSchema));
    return new DruidRecordWriter(dataSchema, realtimeTuningConfig, hdfsDataSegmentPusher, maxPartitionSize,
            new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
Also used : ArrayList(java.util.ArrayList) AggregatorFactory(io.druid.query.aggregation.AggregatorFactory) CustomVersioningPolicy(io.druid.segment.realtime.plumber.CustomVersioningPolicy) DataSchema(io.druid.segment.indexing.DataSchema) DataSegmentPusher(io.druid.segment.loading.DataSegmentPusher) DimensionSchema(io.druid.data.input.impl.DimensionSchema) DimensionsSpec(io.druid.data.input.impl.DimensionsSpec) DoubleSumAggregatorFactory(io.druid.query.aggregation.DoubleSumAggregatorFactory) File(java.io.File) GranularitySpec(io.druid.segment.indexing.granularity.GranularitySpec) HdfsDataSegmentPusher(io.druid.storage.hdfs.HdfsDataSegmentPusher) HdfsDataSegmentPusherConfig(io.druid.storage.hdfs.HdfsDataSegmentPusherConfig) IOException(java.io.IOException) ImmutableList(com.google.common.collect.ImmutableList) InputRowParser(io.druid.data.input.impl.InputRowParser) LongSumAggregatorFactory(io.druid.query.aggregation.LongSumAggregatorFactory) MapInputRowParser(io.druid.data.input.impl.MapInputRowParser) Path(org.apache.hadoop.fs.Path) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) RealtimeTuningConfig(io.druid.segment.indexing.RealtimeTuningConfig) StringDimensionSchema(io.druid.data.input.impl.StringDimensionSchema) TimeAndDimsParseSpec(io.druid.data.input.impl.TimeAndDimsParseSpec) TimestampSpec(io.druid.data.input.impl.TimestampSpec) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) UniformGranularitySpec(io.druid.segment.indexing.granularity.UniformGranularitySpec)
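
To make the column mapping concrete, here is a minimal, self-contained sketch (not part of the Hive source) of what the loop above derives from a hypothetical schema (__time TIMESTAMP, page STRING, user STRING, clicks BIGINT, revenue DOUBLE); the table and column names are invented for illustration, and only constructors already used in the example appear.

import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.StringDimensionSchema;
import io.druid.query.aggregation.AggregatorFactory;
import io.druid.query.aggregation.DoubleSumAggregatorFactory;
import io.druid.query.aggregation.LongSumAggregatorFactory;
import java.util.Arrays;
import java.util.List;

public class DruidColumnMappingSketch {
    public static void main(String[] args) {
        // STRING_GROUP columns become dimensions, as in the default branch above.
        List<DimensionSchema> dimensions = Arrays.<DimensionSchema>asList(
                new StringDimensionSchema("page"),
                new StringDimensionSchema("user"));
        // Integer columns become LongSum metrics and floating-point columns DoubleSum
        // metrics, each keyed by its own column name.
        List<AggregatorFactory> metrics = Arrays.<AggregatorFactory>asList(
                new LongSumAggregatorFactory("clicks", "clicks"),
                new DoubleSumAggregatorFactory("revenue", "revenue"));
        // __time is consumed by the TimestampSpec rather than mapped to either list.
        DimensionsSpec dimensionsSpec = new DimensionsSpec(dimensions, null, null);
        System.out.println(dimensionsSpec.getDimensionNames() + " / " + metrics);
    }
}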

Example 7 with DimensionSchema

Use of io.druid.data.input.impl.DimensionSchema in project druid by druid-io.

From class SegmentAnalyzerTest, the method testIncrementalWorksHelper:

private void testIncrementalWorksHelper(EnumSet<SegmentMetadataQuery.AnalysisType> analyses) throws Exception {
    final List<SegmentAnalysis> results = getSegmentAnalysises(new IncrementalIndexSegment(TestIndex.getIncrementalTestIndex(), null), analyses);
    Assert.assertEquals(1, results.size());
    final SegmentAnalysis analysis = results.get(0);
    Assert.assertNull(analysis.getId());
    final Map<String, ColumnAnalysis> columns = analysis.getColumns();
    Assert.assertEquals(TestIndex.COLUMNS.length, columns.size());
    for (DimensionSchema schema : TestIndex.DIMENSION_SCHEMAS) {
        final String dimension = schema.getName();
        final ColumnAnalysis columnAnalysis = columns.get(dimension);
        // DimensionSchema.ValueType and the column ValueType are distinct enums, hence the comparison by name.
        final boolean isString = schema.getValueType().name().equals(ValueType.STRING.name());
        Assert.assertEquals(dimension, schema.getValueType().name(), columnAnalysis.getType());
        Assert.assertEquals(dimension, 0, columnAnalysis.getSize());
        if (isString) {
            if (analyses == null) {
                Assert.assertTrue(dimension, columnAnalysis.getCardinality() > 0);
            } else {
                Assert.assertEquals(dimension, 0, columnAnalysis.getCardinality().longValue());
            }
        } else {
            Assert.assertNull(dimension, columnAnalysis.getCardinality());
        }
    }
    for (String metric : TestIndex.METRICS) {
        final ColumnAnalysis columnAnalysis = columns.get(metric);
        Assert.assertEquals(metric, ValueType.FLOAT.name(), columnAnalysis.getType());
        Assert.assertEquals(metric, 0, columnAnalysis.getSize());
        Assert.assertNull(metric, columnAnalysis.getCardinality());
    }
}
Also used : IncrementalIndexSegment(io.druid.segment.IncrementalIndexSegment) ColumnAnalysis(io.druid.query.metadata.metadata.ColumnAnalysis) SegmentAnalysis(io.druid.query.metadata.metadata.SegmentAnalysis) DimensionSchema(io.druid.data.input.impl.DimensionSchema)
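
As a small illustration (not part of the Druid test), the assertions above lean on two DimensionSchema accessors, getName() and getValueType(); this sketch reproduces the string-versus-numeric split with hypothetical dimension names standing in for TestIndex.DIMENSION_SCHEMAS:

import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.LongDimensionSchema;
import io.druid.data.input.impl.StringDimensionSchema;
import java.util.Arrays;
import java.util.List;

public class DimensionSchemaSketch {
    public static void main(String[] args) {
        // Hypothetical stand-ins for TestIndex.DIMENSION_SCHEMAS.
        List<DimensionSchema> schemas = Arrays.<DimensionSchema>asList(
                new StringDimensionSchema("market"),
                new LongDimensionSchema("sequence"));
        for (DimensionSchema schema : schemas) {
            // Same accessors the assertions use: only STRING-typed dimensions
            // report a cardinality in the column analysis.
            boolean isString = schema.getValueType() == DimensionSchema.ValueType.STRING;
            System.out.println(schema.getName() + " -> " + schema.getValueType()
                    + (isString ? " (cardinality reported)" : " (cardinality is null)"));
        }
    }
}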

Aggregations

DimensionSchema (io.druid.data.input.impl.DimensionSchema) 7
DimensionsSpec (io.druid.data.input.impl.DimensionsSpec) 4
StringDimensionSchema (io.druid.data.input.impl.StringDimensionSchema) 4
TimestampSpec (io.druid.data.input.impl.TimestampSpec) 3
AggregatorFactory (io.druid.query.aggregation.AggregatorFactory) 3
TimeAndDimsParseSpec (io.druid.data.input.impl.TimeAndDimsParseSpec) 2
ColumnAnalysis (io.druid.query.metadata.metadata.ColumnAnalysis) 2
SegmentAnalysis (io.druid.query.metadata.metadata.SegmentAnalysis) 2
DataSchema (io.druid.segment.indexing.DataSchema) 2
UniformGranularitySpec (io.druid.segment.indexing.granularity.UniformGranularitySpec) 2
ArrayList (java.util.ArrayList) 2
Function (com.google.common.base.Function) 1
ImmutableList (com.google.common.collect.ImmutableList) 1
ImmutableMap (com.google.common.collect.ImmutableMap) 1
MapBasedInputRow (io.druid.data.input.MapBasedInputRow) 1
MapBasedRow (io.druid.data.input.MapBasedRow) 1
InputRowParser (io.druid.data.input.impl.InputRowParser) 1
JSONParseSpec (io.druid.data.input.impl.JSONParseSpec) 1
JSONPathSpec (io.druid.data.input.impl.JSONPathSpec) 1
MapInputRowParser (io.druid.data.input.impl.MapInputRowParser) 1
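
Reading the counts, DimensionSchema most often travels with DimensionsSpec, StringDimensionSchema, and TimestampSpec. Below is a minimal sketch of that common combination, mirroring the parser construction in Example 6 (the dimension names are hypothetical):

import io.druid.data.input.impl.DimensionSchema;
import io.druid.data.input.impl.DimensionsSpec;
import io.druid.data.input.impl.InputRowParser;
import io.druid.data.input.impl.MapInputRowParser;
import io.druid.data.input.impl.StringDimensionSchema;
import io.druid.data.input.impl.TimeAndDimsParseSpec;
import io.druid.data.input.impl.TimestampSpec;
import java.util.Arrays;

public class ParseSpecSketch {
    public static InputRowParser buildParser() {
        // "__time" parsed with the permissive "auto" format; null means no default value.
        TimestampSpec timestampSpec = new TimestampSpec("__time", "auto", null);
        // Two hypothetical string dimensions; no exclusions, no spatial dimensions.
        DimensionsSpec dimensionsSpec = new DimensionsSpec(
                Arrays.<DimensionSchema>asList(
                        new StringDimensionSchema("page"),
                        new StringDimensionSchema("user")),
                null, null);
        return new MapInputRowParser(new TimeAndDimsParseSpec(timestampSpec, dimensionsSpec));
    }
}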