Search in sources :

Example 1 with CustomVersioningPolicy

Use of org.apache.druid.segment.realtime.plumber.CustomVersioningPolicy in project hive by apache.

The method getHiveRecordWriter of the class DruidOutputFormat.

/**
 * Creates the record writer that turns Hive rows into Druid segments.
 *
 * <p>The method reads the Druid settings from the job configuration and the table properties,
 * derives the Druid {@link DataSchema} from the Hive column names and types, and hands
 * everything to a new {@link DruidRecordWriter}.
 *
 * @param jc job configuration; source of the segment directory, working directory, segment
 *     version and the Hive/Druid tuning variables
 * @param finalOutPath only used to resolve the {@code FileSystem} passed to the writer
 * @param valueClass not used by this implementation
 * @param isCompressed not used by this implementation
 * @param tableProperties table properties; must contain the column name and column type lists
 * @param progress not used by this implementation
 * @return a {@link DruidRecordWriter} configured for the target data source
 * @throws IOException declared by the overridden method; the visible code can raise it when
 *     resolving the file system or creating the segment pusher
 * @throws IllegalStateException if the column name/type lists are missing or empty, or if the
 *     timestamp column is not among the declared columns
 */
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
    final int targetNumShardsPerGranularity = Integer.parseUnsignedInt(tableProperties.getProperty(Constants.DRUID_TARGET_SHARDS_PER_GRANULARITY, "0"));
    // An explicit shard target takes precedence: the max partition size is then passed as -1
    // instead of the configured value.
    final int maxPartitionSize = targetNumShardsPerGranularity > 0 ? -1 : HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);

    // If datasource is in the table properties, it is an INSERT/INSERT OVERWRITE as the datasource
    // name was already persisted. Otherwise, it is a CT/CTAS and we need to get the name from the
    // job properties that are set by configureOutputJobProperties in the DruidStorageHandler
    final String tableDataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
    final String dataSource = tableDataSource == null ? jc.get(Constants.DRUID_DATA_SOURCE) : tableDataSource;
    final String segmentDirectory = jc.get(DruidConstants.DRUID_SEGMENT_INTERMEDIATE_DIRECTORY);
    final GranularitySpec granularitySpec = DruidStorageHandlerUtils.getGranularitySpec(jc, tableProperties);

    // Both the column names and their types are required to derive the Druid schema.
    final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
    final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
    if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
        throw new IllegalStateException(String.format("List of columns names [%s] or columns type [%s] is/are not present", columnNameProperty, columnTypeProperty));
    }
    ArrayList<String> columnNames = Lists.newArrayList(columnNameProperty.split(","));
    if (!columnNames.contains(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
        throw new IllegalStateException("Timestamp column (' " + DruidConstants.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + columnNameProperty);
    }
    ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
    Pair<List<DimensionSchema>, AggregatorFactory[]> dimensionsAndAggregates = DruidStorageHandlerUtils.getDimensionsAndAggregates(columnNames, columnTypes);

    // Row parser: timestamp is read from the default timestamp column; the granularity and
    // shard-key helper columns are passed as dimension exclusions.
    final TimestampSpec timestampSpec = new TimestampSpec(DruidConstants.DEFAULT_TIMESTAMP_COLUMN, "auto", null);
    final DimensionsSpec dimensionsSpec = new DimensionsSpec(dimensionsAndAggregates.lhs, Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME, Constants.DRUID_SHARD_KEY_COL_NAME), null);
    final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(timestampSpec, dimensionsSpec));
    // DataSchema takes the parser as a generic map, so convert it through the shared JSON mapper.
    Map<String, Object> inputParser = DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, new TypeReference<Map<String, Object>>() {
    });
    final DataSchema dataSchema = new DataSchema(Preconditions.checkNotNull(dataSource, "Data source name is null"), inputParser, dimensionsAndAggregates.rhs, granularitySpec, null, DruidStorageHandlerUtils.JSON_MAPPER);

    final String workingPath = jc.get(DruidConstants.DRUID_JOB_WORKING_DIRECTORY);
    final String version = jc.get(DruidConstants.DRUID_SEGMENT_VERSION);
    // Fall back to the JVM temp directory when no base persist directory is configured.
    String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
    if (Strings.isNullOrEmpty(basePersistDirectory)) {
        basePersistDirectory = System.getProperty("java.io.tmpdir");
    }
    Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
    IndexSpec indexSpec = DruidStorageHandlerUtils.getIndexSpec(jc);
    // Intermediate data is persisted under <basePersistDirectory>/<dataSource>; segments get the
    // version supplied through the job configuration.
    final File persistDirectory = new File(basePersistDirectory, dataSource);
    RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null, null, persistDirectory, new CustomVersioningPolicy(version), null, null, null, indexSpec, null, true, 0, 0, true, null, 0L, null, null);

    // Guarded so the schema is only rendered to a string when debug logging is enabled.
    if (LOG.isDebugEnabled()) {
        LOG.debug(String.format("running with Data schema [%s] ", dataSchema));
    }
    return new DruidRecordWriter(dataSchema, realtimeTuningConfig, DruidStorageHandlerUtils.createSegmentPusherForDirectory(segmentDirectory, jc), maxPartitionSize, new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
Also used : IndexSpec(org.apache.druid.segment.IndexSpec) MapInputRowParser(org.apache.druid.data.input.impl.MapInputRowParser) TimeAndDimsParseSpec(org.apache.druid.data.input.impl.TimeAndDimsParseSpec) TimestampSpec(org.apache.druid.data.input.impl.TimestampSpec) ArrayList(java.util.ArrayList) List(java.util.List) Path(org.apache.hadoop.fs.Path) RealtimeTuningConfig(org.apache.druid.segment.indexing.RealtimeTuningConfig) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) DataSchema(org.apache.druid.segment.indexing.DataSchema) GranularitySpec(org.apache.druid.segment.indexing.granularity.GranularitySpec) DimensionsSpec(org.apache.druid.data.input.impl.DimensionsSpec) MapInputRowParser(org.apache.druid.data.input.impl.MapInputRowParser) InputRowParser(org.apache.druid.data.input.impl.InputRowParser) CustomVersioningPolicy(org.apache.druid.segment.realtime.plumber.CustomVersioningPolicy) Map(java.util.Map) File(java.io.File)

Aggregations

File (java.io.File)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Map (java.util.Map)1 DimensionsSpec (org.apache.druid.data.input.impl.DimensionsSpec)1 InputRowParser (org.apache.druid.data.input.impl.InputRowParser)1 MapInputRowParser (org.apache.druid.data.input.impl.MapInputRowParser)1 TimeAndDimsParseSpec (org.apache.druid.data.input.impl.TimeAndDimsParseSpec)1 TimestampSpec (org.apache.druid.data.input.impl.TimestampSpec)1 IndexSpec (org.apache.druid.segment.IndexSpec)1 DataSchema (org.apache.druid.segment.indexing.DataSchema)1 RealtimeTuningConfig (org.apache.druid.segment.indexing.RealtimeTuningConfig)1 GranularitySpec (org.apache.druid.segment.indexing.granularity.GranularitySpec)1 CustomVersioningPolicy (org.apache.druid.segment.realtime.plumber.CustomVersioningPolicy)1 Path (org.apache.hadoop.fs.Path)1 TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)1