use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.
the class SegmentWithHllIndexCreateHelper method setupStarTreeConfig.
/**
 * Enables star-tree index generation on the given segment generator config and attaches a
 * {@link StarTreeIndexSpec} whose max-leaf-records value is {@code StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS}.
 * Afterwards the schema currently held by the config is printed.
 *
 * @param segmentGenConfig segment generator config that is modified in place
 */
private void setupStarTreeConfig(SegmentGeneratorConfig segmentGenConfig) {
  // Prepare the spec up front; only the leaf-record limit is set explicitly.
  StarTreeIndexSpec defaultSpec = new StarTreeIndexSpec();
  defaultSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);

  // StarTree related
  segmentGenConfig.setEnableStarTreeIndex(true);
  segmentGenConfig.setStarTreeIndexSpec(defaultSpec);

  LOGGER.info("segmentGenConfig Schema (w/o derived fields): ");
  printSchema(segmentGenConfig.getSchema());
}
use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.
the class SegmentIndexCreationDriverImpl method buildStarTree.
/**
 * Builds the segment through the star-tree path.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Create the pre-index stats collector and publish it as {@code segmentStats}.</li>
 *   <li>Resolve the {@link StarTreeIndexSpec} from the config (installing a default one when absent) and
 *       translate it into a {@link StarTreeBuilderConfig}.</li>
 *   <li>Feed every raw row from {@code recordReader} to the star-tree builder and the stats collector.</li>
 *   <li>Build the tree, then collect stats for the aggregated documents the builder produced.</li>
 *   <li>Initialize the index creator and index all documents (raw followed by aggregated).</li>
 *   <li>Write builder-chosen split order / skip-materialization dimensions back into the spec when the
 *       user did not supply them, serialize the tree and run post-creation handling.</li>
 * </ol>
 *
 * <p>The builder's {@code cleanup()} is invoked in a {@code finally} block so that it also runs when any
 * step after {@code init} fails.
 *
 * @throws Exception if any of the steps above fails
 */
private void buildStarTree() throws Exception {
  // Create stats collector
  SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
  statsCollector.init();
  segmentStats = statsCollector;
  long start = System.currentTimeMillis();

  // Construct star tree builder config; install a default spec on the config when none was provided.
  StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
  if (starTreeIndexSpec == null) {
    starTreeIndexSpec = new StarTreeIndexSpec();
    starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
    config.setStarTreeIndexSpec(starTreeIndexSpec);
  }

  // The time column is dropped from a user-provided split order. Note that this removes the entry from
  // the list object returned by the spec itself.
  List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
  if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
    final String timeColumnName = config.getTimeColumnName();
    if (timeColumnName != null) {
      dimensionsSplitOrder.remove(timeColumnName);
    }
  }

  // Create star builder config from startreeindexspec. Merge these two in one later.
  StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
  starTreeBuilderConfig.setSchema(dataSchema);
  starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
  starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
  starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
  Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
  starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
  starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
  starTreeBuilderConfig.setOutDir(starTreeTempDir);
  boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
  starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);

  // Initialize star tree builder
  StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
  starTreeBuilder.init(starTreeBuilderConfig);

  // Declared outside the try block so the timing summary can still be logged after cleanup.
  long starTreeBuildFinishTime;
  long statCollectionFinishTime;
  try {
    // Build star tree along with collecting stats
    recordReader.rewind();
    LOGGER.info("Start append raw data to star tree builder!");
    totalDocs = 0;
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (recordReader.hasNext()) {
      // PlainFieldExtractor conducts necessary type conversions
      transformedRow = readNextRowSanitized(readRow, transformedRow);
      // Must be called after previous step since type conversion for derived values is unnecessary
      populateDefaultDerivedColumnValues(transformedRow);
      starTreeBuilder.append(transformedRow);
      statsCollector.collectRow(transformedRow);
      totalRawDocs++;
      totalDocs++;
    }
    recordReader.close();

    LOGGER.info("Start building star tree!");
    starTreeBuilder.build();
    LOGGER.info("Finished building star tree!");
    starTreeBuildFinishTime = System.currentTimeMillis();

    // Build stats: count the number of documents and gather per-column statistics.
    // Aggregated documents occupy the index range [rawCount, rawCount + aggregateCount).
    LOGGER.info("Start building StatsCollector!");
    Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(), starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (aggregatedRowsIterator.hasNext()) {
      GenericRow genericRow = aggregatedRowsIterator.next();
      statsCollector.collectRow(genericRow, true);
      totalAggDocs++;
      totalDocs++;
    }
    statsCollector.build();
    buildIndexCreationInfo();
    LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
    statCollectionFinishTime = System.currentTimeMillis();

    // Initialize the index creation using the per-column statistics information
    indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);

    // Iterate over the data again, this time covering raw and aggregated documents in one pass.
    Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (allRowsIterator.hasNext()) {
      GenericRow genericRow = allRowsIterator.next();
      indexCreator.indexRow(genericRow);
    }

    // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
    }
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
    }
    serializeTree(starTreeBuilder, enableOffHeapFormat);

    // Post creation
    handlePostCreation();
  } finally {
    // Always release whatever the builder holds, including when a step above threw.
    starTreeBuilder.cleanup();
  }

  long end = System.currentTimeMillis();
  LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}
use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.
the class SegmentColumnarIndexCreator method writeMetadata.
/**
 * Populates the segment metadata properties file (located under {@code file} with the name
 * {@code V1Constants.MetadataKeys.METADATA_FILE_NAME}) and saves it.
 *
 * <p>The properties are written in this order: general segment information and document counters,
 * star-tree settings (when a spec is configured), the HLL log2m value (when an HLL config is present),
 * time range information, custom properties from the generator config, and finally the per-column
 * metadata for every entry of {@code indexCreationInfoMap}.
 *
 * @throws ConfigurationException declared for the underlying properties-file operations
 */
void writeMetadata() throws ConfigurationException {
  final File metadataFile = new File(file, V1Constants.MetadataKeys.METADATA_FILE_NAME);
  final PropertiesConfiguration properties = new PropertiesConfiguration(metadataFile);

  // General segment information.
  properties.setProperty(SEGMENT_CREATOR_VERSION, config.getCreatorVersion());
  final String paddingCharacter = Character.toString(config.getPaddingCharacter());
  properties.setProperty(SEGMENT_PADDING_CHARACTER, StringEscapeUtils.escapeJava(paddingCharacter));
  properties.setProperty(SEGMENT_NAME, segmentName);
  properties.setProperty(TABLE_NAME, config.getTableName());
  properties.setProperty(DIMENSIONS, config.getDimensions());
  properties.setProperty(METRICS, config.getMetrics());
  properties.setProperty(TIME_COLUMN_NAME, config.getTimeColumnName());
  properties.setProperty(TIME_INTERVAL, "not_there");

  // Document counters and conversion statistics.
  properties.setProperty(SEGMENT_TOTAL_RAW_DOCS, String.valueOf(totalRawDocs));
  properties.setProperty(SEGMENT_TOTAL_AGGREGATE_DOCS, String.valueOf(totalAggDocs));
  properties.setProperty(SEGMENT_TOTAL_DOCS, String.valueOf(totalDocs));
  properties.setProperty(STAR_TREE_ENABLED, String.valueOf(config.isEnableStarTreeIndex()));
  properties.setProperty(SEGMENT_TOTAL_ERRORS, String.valueOf(totalErrors));
  properties.setProperty(SEGMENT_TOTAL_NULLS, String.valueOf(totalNulls));
  properties.setProperty(SEGMENT_TOTAL_CONVERSIONS, String.valueOf(totalConversions));
  properties.setProperty(SEGMENT_TOTAL_NULL_COLS, String.valueOf(totalNullCols));

  // Star-tree settings are only recorded when a spec is attached to the config.
  final StarTreeIndexSpec spec = config.getStarTreeIndexSpec();
  if (spec != null) {
    properties.setProperty(STAR_TREE_SPLIT_ORDER, spec.getDimensionsSplitOrder());
    properties.setProperty(STAR_TREE_MAX_LEAF_RECORDS, spec.getMaxLeafRecords());
    properties.setProperty(STAR_TREE_SKIP_STAR_NODE_CREATION_FOR_DIMENSIONS, spec.getSkipStarNodeCreationForDimensions());
    properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_CARDINALITY, spec.getskipMaterializationCardinalityThreshold());
    properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_FOR_DIMENSIONS, spec.getskipMaterializationForDimensions());
  }

  // HLL settings; the derived-field map stays null when no HLL config exists.
  final HllConfig hll = config.getHllConfig();
  final Map<String, String> hllDerivedToOrigin;
  if (hll == null) {
    hllDerivedToOrigin = null;
  } else {
    properties.setProperty(SEGMENT_HLL_LOG2M, hll.getHllLog2m());
    hllDerivedToOrigin = hll.getDerivedHllFieldToOriginMap();
  }

  // Time range taken from the collected stats of the time column, when such stats exist.
  final String timeColumn = config.getTimeColumnName();
  final ColumnIndexCreationInfo timeColumnInfo = indexCreationInfoMap.get(timeColumn);
  if (timeColumnInfo != null) {
    properties.setProperty(SEGMENT_START_TIME, timeColumnInfo.getMin());
    properties.setProperty(SEGMENT_END_TIME, timeColumnInfo.getMax());
    properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit());
  }

  // Values explicitly present in the custom properties are written after the stats-derived ones.
  if (config.containsCustomProperty(SEGMENT_START_TIME)) {
    properties.setProperty(SEGMENT_START_TIME, config.getStartTime());
  }
  if (config.containsCustomProperty(SEGMENT_END_TIME)) {
    properties.setProperty(SEGMENT_END_TIME, config.getEndTime());
  }
  if (config.containsCustomProperty(TIME_UNIT)) {
    properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit());
  }
  for (Map.Entry<String, String> customProperty : config.getCustomProperties().entrySet()) {
    properties.setProperty(customProperty.getKey(), customProperty.getValue());
  }

  // Per-column metadata.
  for (Map.Entry<String, ColumnIndexCreationInfo> columnEntry : indexCreationInfoMap.entrySet()) {
    final String column = columnEntry.getKey();
    final ColumnIndexCreationInfo creationInfo = columnEntry.getValue();

    final SegmentDictionaryCreator dictionaryCreator = dictionaryCreatorMap.get(column);
    int dictionaryElementSize = 0;
    if (dictionaryCreator != null) {
      dictionaryElementSize = dictionaryCreator.getStringColumnMaxLength();
    }

    // TODO: after fixing the server-side dependency on HAS_INVERTED_INDEX and deployed, set HAS_INVERTED_INDEX properly
    // The hasInvertedIndex flag in segment metadata is picked up in ColumnMetadata, and will be used during the query
    // plan phase. If it is set to false, then inverted indexes are not used in queries even if they are created via table
    // configs on segment load. So, we set it to true here for now, until we fix the server to update the value inside
    // ColumnMetadata, export information to the query planner that the inverted index available is current and can be used.
    //
    // boolean hasInvertedIndex = invertedIndexCreatorMap.containsKey();
    final boolean hasInvertedIndex = true;

    final String hllOriginColumn = (hllDerivedToOrigin == null) ? null : hllDerivedToOrigin.get(column);

    addColumnMetadataInfo(properties, column, creationInfo, totalDocs, totalRawDocs, totalAggDocs, schema.getFieldSpecFor(column), dictionaryCreatorMap.containsKey(column), dictionaryElementSize, hasInvertedIndex, hllOriginColumn);
  }

  properties.save();
}
use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.
the class BaseClusterIntegrationTest method buildSegmentsFromAvro.
/**
 * Builds one segment per Avro file in parallel on the given executor and tars each generated segment.
 *
 * <p>Each task generates its segment under {@code baseDirectory/segment-<n>} (where {@code n} is the
 * 1-based file position plus {@code baseSegmentIndex}) and writes the tar.gz into {@code segmentTarDir}.
 * When {@code createStarTreeIndex} is set, a {@link StarTreeIndexSpec} with the off-heap format enabled
 * is attached to the generator config.
 *
 * @param avroFiles input Avro files, one segment is built per file
 * @param executor executor on which the segment build tasks are run
 * @param baseSegmentIndex offset added to the 1-based file position to obtain the segment number
 * @param baseDirectory directory under which each segment's output directory is created
 * @param segmentTarDir directory that receives the tarred segments
 * @param tableName table name passed to the segment generator config
 * @param createStarTreeIndex whether to enable star-tree index generation
 * @param inputPinotSchema schema passed to the segment generator config; may be {@code null}
 * @return future resolving to a map from each input Avro file to its tarred segment file
 */
public static Future<Map<File, File>> buildSegmentsFromAvro(final List<File> avroFiles, Executor executor, int baseSegmentIndex, final File baseDirectory, final File segmentTarDir, final String tableName, final boolean createStarTreeIndex, final com.linkedin.pinot.common.data.Schema inputPinotSchema) {
  int segmentCount = avroFiles.size();
  LOGGER.info("Building " + segmentCount + " segments in parallel");
  List<ListenableFutureTask<Pair<File, File>>> futureTasks = new ArrayList<ListenableFutureTask<Pair<File, File>>>();
  for (int i = 1; i <= segmentCount; ++i) {
    final int segmentIndex = i - 1;
    final int segmentNumber = i + baseSegmentIndex;
    final ListenableFutureTask<Pair<File, File>> buildSegmentFutureTask = ListenableFutureTask.<Pair<File, File>>create(new Callable<Pair<File, File>>() {
      @Override
      public Pair<File, File> call() throws Exception {
        try {
          // Build segment
          LOGGER.info("Starting to build segment " + segmentNumber);
          File outputDir = new File(baseDirectory, "segment-" + segmentNumber);
          final File inputAvroFile = avroFiles.get(segmentIndex);
          final SegmentGeneratorConfig genConfig = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(inputAvroFile, outputDir, TimeUnit.DAYS, tableName, inputPinotSchema);
          if (inputPinotSchema != null) {
            genConfig.setSchema(inputPinotSchema);
          }
          // jfim: We add a space and a special character to do a regression test for PINOT-3296 Segments with spaces
          // in their filename don't work properly
          genConfig.setSegmentNamePostfix(Integer.toString(segmentNumber) + " %");
          genConfig.setEnableStarTreeIndex(createStarTreeIndex);
          // Enable off heap star tree format in the integration test.
          StarTreeIndexSpec starTreeIndexSpec = null;
          if (createStarTreeIndex) {
            starTreeIndexSpec = new StarTreeIndexSpec();
            starTreeIndexSpec.setEnableOffHeapFormat(true);
          }
          genConfig.setStarTreeIndexSpec(starTreeIndexSpec);
          final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
          driver.init(genConfig);
          driver.build();
          // Tar segment. File.list() returns null when the directory cannot be listed, so fail with a
          // descriptive message instead of an NPE / ArrayIndexOutOfBoundsException.
          String[] generatedSegmentNames = outputDir.list();
          if (generatedSegmentNames == null || generatedSegmentNames.length == 0) {
            throw new IllegalStateException("No segment was generated in " + outputDir.getAbsolutePath());
          }
          String segmentName = generatedSegmentNames[0];
          final String tarGzPath = TarGzCompressionUtils.createTarGzOfDirectory(outputDir.getAbsolutePath() + "/" + segmentName, new File(segmentTarDir, segmentName).getAbsolutePath());
          LOGGER.info("Completed segment " + segmentNumber + " : " + segmentName + " from file " + inputAvroFile.getName());
          return new ImmutablePair<File, File>(inputAvroFile, new File(tarGzPath));
        } catch (Exception e) {
          // Pass the exception as the trailing argument so the stack trace ends up in the log.
          LOGGER.error("Exception while building segment input: {} output {} ", avroFiles.get(segmentIndex), "segment-" + segmentNumber, e);
          throw new RuntimeException(e);
        }
      }
    });
    futureTasks.add(buildSegmentFutureTask);
    executor.execute(buildSegmentFutureTask);
  }
  // Combine the per-segment results into a single avro-file -> tar-file map.
  ListenableFuture<List<Pair<File, File>>> pairListFuture = Futures.allAsList(futureTasks);
  return Futures.transform(pairListFuture, new AsyncFunction<List<Pair<File, File>>, Map<File, File>>() {
    @Override
    public ListenableFuture<Map<File, File>> apply(List<Pair<File, File>> input) throws Exception {
      Map<File, File> avroToSegmentMap = new HashMap<File, File>();
      for (Pair<File, File> avroToSegmentPair : input) {
        avroToSegmentMap.put(avroToSegmentPair.getLeft(), avroToSegmentPair.getRight());
      }
      return Futures.immediateFuture(avroToSegmentMap);
    }
  });
}
use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.
the class TestStarTreeMetadata method setupSegment.
/**
 * Generates a star-tree enabled segment from the {@code AVRO_DATA} resource into the given directory,
 * removing the directory first when it already exists.
 *
 * <p>The star-tree spec is populated from the test constants (split order, max leaf records,
 * skip-materialization cardinality threshold, skip-star-node-creation dimensions and
 * skip-materialization dimensions).
 *
 * @param segmentDir directory in which the segment is generated
 * @throws Exception if resolving the input file or building the segment fails
 */
private void setupSegment(File segmentDir) throws Exception {
  final String avroPath = TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(AVRO_DATA));

  // Start from a clean output directory.
  if (segmentDir.exists()) {
    FileUtils.deleteQuietly(segmentDir);
  }

  final File avroFile = new File(avroPath);
  final SegmentGeneratorConfig generatorConfig = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(avroFile, segmentDir, "time_day", TimeUnit.DAYS, TABLE_NAME);
  generatorConfig.setTableName(TABLE_NAME);
  generatorConfig.setSegmentName(SEGMENT_NAME);

  // Star-tree settings under test.
  final StarTreeIndexSpec spec = new StarTreeIndexSpec();
  spec.setDimensionsSplitOrder(DIMENSIONS_SPLIT_ORDER);
  spec.setMaxLeafRecords(MAX_LEAF_RECORDS);
  spec.setSkipMaterializationCardinalityThreshold(SKIP_CARDINALITY_THRESHOLD);
  spec.setSkipStarNodeCreationForDimensions(SKIP_STAR_NODE_CREATION_DIMENSTIONS);
  spec.setSkipMaterializationForDimensions(SKIP_MATERIALIZATION_DIMENSIONS);
  generatorConfig.setEnableStarTreeIndex(true);
  generatorConfig.setStarTreeIndexSpec(spec);

  // Run the segment creation.
  final SegmentIndexCreationDriver creationDriver = SegmentCreationDriverFactory.get(null);
  creationDriver.init(generatorConfig);
  creationDriver.build();
}
Aggregations