use of io.druid.storage.hdfs.HdfsDataSegmentPusherConfig in project hive by apache.
the class DruidOutputFormat method getHiveRecordWriter.
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
final String segmentGranularity = tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_GRANULARITY) : HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_INDEXING_GRANULARITY);
final String dataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
final String segmentDirectory = tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) != null ? tableProperties.getProperty(Constants.DRUID_SEGMENT_DIRECTORY) : HiveConf.getVar(jc, HiveConf.ConfVars.DRUID_SEGMENT_DIRECTORY);
final HdfsDataSegmentPusherConfig hdfsDataSegmentPusherConfig = new HdfsDataSegmentPusherConfig();
hdfsDataSegmentPusherConfig.setStorageDirectory(segmentDirectory);
final DataSegmentPusher hdfsDataSegmentPusher = new HdfsDataSegmentPusher(hdfsDataSegmentPusherConfig, jc, DruidStorageHandlerUtils.JSON_MAPPER);
final GranularitySpec granularitySpec = new UniformGranularitySpec(Granularity.valueOf(segmentGranularity), QueryGranularity.fromString(tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY) == null ? "NONE" : tableProperties.getProperty(Constants.DRUID_QUERY_GRANULARITY)), null);
final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
throw new IllegalStateException(String.format("List of columns names [%s] or columns type [%s] is/are not present", columnNameProperty, columnTypeProperty));
}
ArrayList<String> columnNames = new ArrayList<String>();
for (String name : columnNameProperty.split(",")) {
columnNames.add(name);
}
if (!columnNames.contains(DruidTable.DEFAULT_TIMESTAMP_COLUMN)) {
throw new IllegalStateException("Timestamp column (' " + DruidTable.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + tableProperties.getProperty(serdeConstants.LIST_COLUMNS));
}
ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
// Default, all columns that are not metrics or timestamp, are treated as dimensions
final List<DimensionSchema> dimensions = new ArrayList<>();
ImmutableList.Builder<AggregatorFactory> aggregatorFactoryBuilder = ImmutableList.builder();
for (int i = 0; i < columnTypes.size(); i++) {
PrimitiveTypeInfo f = (PrimitiveTypeInfo) columnTypes.get(i);
AggregatorFactory af;
switch(f.getPrimitiveCategory()) {
case BYTE:
case SHORT:
case INT:
case LONG:
af = new LongSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
break;
case FLOAT:
case DOUBLE:
case DECIMAL:
af = new DoubleSumAggregatorFactory(columnNames.get(i), columnNames.get(i));
break;
case TIMESTAMP:
String tColumnName = columnNames.get(i);
if (!tColumnName.equals(DruidTable.DEFAULT_TIMESTAMP_COLUMN) && !tColumnName.equals(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME)) {
throw new IOException("Dimension " + tColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
}
continue;
default:
// Dimension
String dColumnName = columnNames.get(i);
if (PrimitiveObjectInspectorUtils.getPrimitiveGrouping(f.getPrimitiveCategory()) != PrimitiveGrouping.STRING_GROUP) {
throw new IOException("Dimension " + dColumnName + " does not have STRING type: " + f.getPrimitiveCategory());
}
dimensions.add(new StringDimensionSchema(dColumnName));
continue;
}
aggregatorFactoryBuilder.add(af);
}
List<AggregatorFactory> aggregatorFactories = aggregatorFactoryBuilder.build();
final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidTable.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(dimensions, Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME), null)));
Map<String, Object> inputParser = DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, Map.class);
final DataSchema dataSchema = new DataSchema(Preconditions.checkNotNull(dataSource, "Data source name is null"), inputParser, aggregatorFactories.toArray(new AggregatorFactory[aggregatorFactories.size()]), granularitySpec, DruidStorageHandlerUtils.JSON_MAPPER);
final String workingPath = jc.get(Constants.DRUID_JOB_WORKING_DIRECTORY);
final String version = jc.get(Constants.DRUID_SEGMENT_VERSION);
Integer maxPartitionSize = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);
String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
if (Strings.isNullOrEmpty(basePersistDirectory)) {
basePersistDirectory = System.getProperty("java.io.tmpdir");
}
Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null, new File(basePersistDirectory, dataSource), new CustomVersioningPolicy(version), null, null, null, null, true, 0, 0, true, null);
LOG.debug(String.format("running with Data schema [%s] ", dataSchema));
return new DruidRecordWriter(dataSchema, realtimeTuningConfig, hdfsDataSegmentPusher, maxPartitionSize, new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
Aggregations