use of org.apache.druid.segment.IndexSpec in project druid by druid-io.
the class IndexMergeBenchmark method setup.
@Setup
public void setup() throws IOException {
log.info("SETUP CALLED AT " + System.currentTimeMillis());
indexMergerV9 = new IndexMergerV9(JSON_MAPPER, INDEX_IO, getSegmentWriteOutMediumFactory(factoryType));
ComplexMetrics.registerSerde("hyperUnique", new HyperUniquesSerde());
indexesToMerge = new ArrayList<>();
schemaInfo = GeneratorBasicSchemas.SCHEMA_MAP.get(schema);
for (int i = 0; i < numSegments; i++) {
DataGenerator gen = new DataGenerator(schemaInfo.getColumnSchemas(), RNG_SEED + i, schemaInfo.getDataInterval(), rowsPerSegment);
IncrementalIndex incIndex = makeIncIndex();
gen.addToIndex(incIndex, rowsPerSegment);
tmpDir = FileUtils.createTempDir();
log.info("Using temp dir: " + tmpDir.getAbsolutePath());
File indexFile = indexMergerV9.persist(incIndex, tmpDir, new IndexSpec(), null);
QueryableIndex qIndex = INDEX_IO.loadIndex(indexFile);
indexesToMerge.add(qIndex);
}
}
use of org.apache.druid.segment.IndexSpec in project druid by druid-io.
the class IndexPersistBenchmark method persistV9.
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MICROSECONDS)
public void persistV9(Blackhole blackhole) throws Exception {
File indexFile = INDEX_MERGER_V9.persist(incIndex, tmpDir, new IndexSpec(), null);
blackhole.consume(indexFile);
}
use of org.apache.druid.segment.IndexSpec in project hive by apache.
the class DruidStorageHandler method updateKafkaIngestion.
private void updateKafkaIngestion(Table table) {
final String overlordAddress = HiveConf.getVar(getConf(), HiveConf.ConfVars.HIVE_DRUID_OVERLORD_DEFAULT_ADDRESS);
final String dataSourceName = Preconditions.checkNotNull(DruidStorageHandlerUtils.getTableProperty(table, Constants.DRUID_DATA_SOURCE), "Druid datasource name is null");
final String kafkaTopic = Preconditions.checkNotNull(DruidStorageHandlerUtils.getTableProperty(table, DruidConstants.KAFKA_TOPIC), "kafka topic is null");
final String kafkaServers = Preconditions.checkNotNull(DruidStorageHandlerUtils.getTableProperty(table, DruidConstants.KAFKA_BOOTSTRAP_SERVERS), "kafka connect string is null");
Properties tableProperties = new Properties();
tableProperties.putAll(table.getParameters());
final GranularitySpec granularitySpec = DruidStorageHandlerUtils.getGranularitySpec(getConf(), tableProperties);
List<FieldSchema> columns = table.getSd().getCols();
List<String> columnNames = new ArrayList<>(columns.size());
List<TypeInfo> columnTypes = new ArrayList<>(columns.size());
for (FieldSchema schema : columns) {
columnNames.add(schema.getName());
columnTypes.add(TypeInfoUtils.getTypeInfoFromTypeString(schema.getType()));
}
Pair<List<DimensionSchema>, AggregatorFactory[]> dimensionsAndAggregates = DruidStorageHandlerUtils.getDimensionsAndAggregates(columnNames, columnTypes);
if (!columnNames.contains(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
throw new IllegalStateException("Timestamp column (' " + DruidConstants.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + columnNames);
}
DimensionsSpec dimensionsSpec = new DimensionsSpec(dimensionsAndAggregates.lhs, null, null);
String timestampFormat = DruidStorageHandlerUtils.getTableProperty(table, DruidConstants.DRUID_TIMESTAMP_FORMAT);
String timestampColumnName = DruidStorageHandlerUtils.getTableProperty(table, DruidConstants.DRUID_TIMESTAMP_COLUMN);
if (timestampColumnName == null) {
timestampColumnName = DruidConstants.DEFAULT_TIMESTAMP_COLUMN;
}
final TimestampSpec timestampSpec = new TimestampSpec(timestampColumnName, timestampFormat, null);
final InputRowParser inputRowParser = DruidKafkaUtils.getInputRowParser(table, timestampSpec, dimensionsSpec);
final Map<String, Object> inputParser = JSON_MAPPER.convertValue(inputRowParser, new TypeReference<Map<String, Object>>() {
});
final DataSchema dataSchema = new DataSchema(dataSourceName, inputParser, dimensionsAndAggregates.rhs, granularitySpec, null, DruidStorageHandlerUtils.JSON_MAPPER);
IndexSpec indexSpec = DruidStorageHandlerUtils.getIndexSpec(getConf());
KafkaSupervisorSpec spec = DruidKafkaUtils.createKafkaSupervisorSpec(table, kafkaTopic, kafkaServers, dataSchema, indexSpec);
// Fetch existing Ingestion Spec from Druid, if any
KafkaSupervisorSpec existingSpec = fetchKafkaIngestionSpec(table);
String targetState = DruidStorageHandlerUtils.getTableProperty(table, DruidConstants.DRUID_KAFKA_INGESTION);
if (targetState == null) {
// Case when user has not specified any ingestion state in the current command
// if there is a kafka supervisor running then keep it last known state is START otherwise STOP.
targetState = existingSpec == null ? "STOP" : "START";
}
if ("STOP".equalsIgnoreCase(targetState)) {
if (existingSpec != null) {
stopKafkaIngestion(overlordAddress, dataSourceName);
}
} else if ("START".equalsIgnoreCase(targetState)) {
if (existingSpec == null || !existingSpec.equals(spec)) {
DruidKafkaUtils.updateKafkaIngestionSpec(overlordAddress, spec);
}
} else if ("RESET".equalsIgnoreCase(targetState)) {
// Case when there are changes in multiple table properties.
if (existingSpec != null && !existingSpec.equals(spec)) {
DruidKafkaUtils.updateKafkaIngestionSpec(overlordAddress, spec);
}
resetKafkaIngestion(overlordAddress, dataSourceName);
} else {
throw new IllegalArgumentException(String.format("Invalid value for property [%s], Valid values are [START, STOP, RESET]", DruidConstants.DRUID_KAFKA_INGESTION));
}
// We do not want to keep state in two separate places so remove from hive table properties.
table.getParameters().remove(DruidConstants.DRUID_KAFKA_INGESTION);
}
use of org.apache.druid.segment.IndexSpec in project hive by apache.
the class DruidOutputFormat method getHiveRecordWriter.
@Override
public FileSinkOperator.RecordWriter getHiveRecordWriter(JobConf jc, Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, Progressable progress) throws IOException {
final int targetNumShardsPerGranularity = Integer.parseUnsignedInt(tableProperties.getProperty(Constants.DRUID_TARGET_SHARDS_PER_GRANULARITY, "0"));
final int maxPartitionSize = targetNumShardsPerGranularity > 0 ? -1 : HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_PARTITION_SIZE);
// If datasource is in the table properties, it is an INSERT/INSERT OVERWRITE as the datasource
// name was already persisted. Otherwise, it is a CT/CTAS and we need to get the name from the
// job properties that are set by configureOutputJobProperties in the DruidStorageHandler
final String dataSource = tableProperties.getProperty(Constants.DRUID_DATA_SOURCE) == null ? jc.get(Constants.DRUID_DATA_SOURCE) : tableProperties.getProperty(Constants.DRUID_DATA_SOURCE);
final String segmentDirectory = jc.get(DruidConstants.DRUID_SEGMENT_INTERMEDIATE_DIRECTORY);
final GranularitySpec granularitySpec = DruidStorageHandlerUtils.getGranularitySpec(jc, tableProperties);
final String columnNameProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMNS);
final String columnTypeProperty = tableProperties.getProperty(serdeConstants.LIST_COLUMN_TYPES);
if (StringUtils.isEmpty(columnNameProperty) || StringUtils.isEmpty(columnTypeProperty)) {
throw new IllegalStateException(String.format("List of columns names [%s] or columns type [%s] is/are not present", columnNameProperty, columnTypeProperty));
}
ArrayList<String> columnNames = Lists.newArrayList(columnNameProperty.split(","));
if (!columnNames.contains(DruidConstants.DEFAULT_TIMESTAMP_COLUMN)) {
throw new IllegalStateException("Timestamp column (' " + DruidConstants.DEFAULT_TIMESTAMP_COLUMN + "') not specified in create table; list of columns is : " + tableProperties.getProperty(serdeConstants.LIST_COLUMNS));
}
ArrayList<TypeInfo> columnTypes = TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty);
Pair<List<DimensionSchema>, AggregatorFactory[]> dimensionsAndAggregates = DruidStorageHandlerUtils.getDimensionsAndAggregates(columnNames, columnTypes);
final InputRowParser inputRowParser = new MapInputRowParser(new TimeAndDimsParseSpec(new TimestampSpec(DruidConstants.DEFAULT_TIMESTAMP_COLUMN, "auto", null), new DimensionsSpec(dimensionsAndAggregates.lhs, Lists.newArrayList(Constants.DRUID_TIMESTAMP_GRANULARITY_COL_NAME, Constants.DRUID_SHARD_KEY_COL_NAME), null)));
Map<String, Object> inputParser = DruidStorageHandlerUtils.JSON_MAPPER.convertValue(inputRowParser, new TypeReference<Map<String, Object>>() {
});
final DataSchema dataSchema = new DataSchema(Preconditions.checkNotNull(dataSource, "Data source name is null"), inputParser, dimensionsAndAggregates.rhs, granularitySpec, null, DruidStorageHandlerUtils.JSON_MAPPER);
final String workingPath = jc.get(DruidConstants.DRUID_JOB_WORKING_DIRECTORY);
final String version = jc.get(DruidConstants.DRUID_SEGMENT_VERSION);
String basePersistDirectory = HiveConf.getVar(jc, HiveConf.ConfVars.HIVE_DRUID_BASE_PERSIST_DIRECTORY);
if (Strings.isNullOrEmpty(basePersistDirectory)) {
basePersistDirectory = System.getProperty("java.io.tmpdir");
}
Integer maxRowInMemory = HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVE_DRUID_MAX_ROW_IN_MEMORY);
IndexSpec indexSpec = DruidStorageHandlerUtils.getIndexSpec(jc);
RealtimeTuningConfig realtimeTuningConfig = new RealtimeTuningConfig(maxRowInMemory, null, null, null, new File(basePersistDirectory, dataSource), new CustomVersioningPolicy(version), null, null, null, indexSpec, null, true, 0, 0, true, null, 0L, null, null);
LOG.debug(String.format("running with Data schema [%s] ", dataSchema));
return new DruidRecordWriter(dataSchema, realtimeTuningConfig, DruidStorageHandlerUtils.createSegmentPusherForDirectory(segmentDirectory, jc), maxPartitionSize, new Path(workingPath, SEGMENTS_DESCRIPTOR_DIR_NAME), finalOutPath.getFileSystem(jc));
}
use of org.apache.druid.segment.IndexSpec in project druid by druid-io.
the class FilterPartitionBenchmark method setup.
@Setup
public void setup() throws IOException {
log.info("SETUP CALLED AT " + System.currentTimeMillis());
ComplexMetrics.registerSerde("hyperUnique", new HyperUniquesSerde());
schemaInfo = GeneratorBasicSchemas.SCHEMA_MAP.get(schema);
DataGenerator gen = new DataGenerator(schemaInfo.getColumnSchemas(), RNG_SEED, schemaInfo.getDataInterval(), rowsPerSegment);
incIndex = makeIncIndex();
for (int j = 0; j < rowsPerSegment; j++) {
InputRow row = gen.nextRow();
if (j % 10000 == 0) {
log.info(j + " rows generated.");
}
incIndex.add(row);
}
tmpDir = FileUtils.createTempDir();
log.info("Using temp dir: " + tmpDir.getAbsolutePath());
indexFile = INDEX_MERGER_V9.persist(incIndex, tmpDir, new IndexSpec(), null);
qIndex = INDEX_IO.loadIndex(indexFile);
Interval interval = schemaInfo.getDataInterval();
timeFilterNone = new BoundFilter(new BoundDimFilter(ColumnHolder.TIME_COLUMN_NAME, String.valueOf(Long.MAX_VALUE), String.valueOf(Long.MAX_VALUE), true, true, null, null, StringComparators.ALPHANUMERIC));
long halfEnd = (interval.getEndMillis() + interval.getStartMillis()) / 2;
timeFilterHalf = new BoundFilter(new BoundDimFilter(ColumnHolder.TIME_COLUMN_NAME, String.valueOf(interval.getStartMillis()), String.valueOf(halfEnd), true, true, null, null, StringComparators.ALPHANUMERIC));
timeFilterAll = new BoundFilter(new BoundDimFilter(ColumnHolder.TIME_COLUMN_NAME, String.valueOf(interval.getStartMillis()), String.valueOf(interval.getEndMillis()), true, true, null, null, StringComparators.ALPHANUMERIC));
}
Aggregations