Use of com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl in the Pinot project by LinkedIn.
Class SegmentIndexCreationDriverImpl, method buildStarTree.
/**
 * Builds the segment together with a star tree.
 *
 * <p>The work happens in this order:
 * <ol>
 *   <li>Create the stats collector and publish it through {@code segmentStats}.</li>
 *   <li>Resolve the {@code StarTreeIndexSpec} from the config (creating one with the default
 *       max-leaf-records when the config has none) and copy its settings into a
 *       {@code StarTreeBuilderConfig}.</li>
 *   <li>Rewind the record reader and feed every sanitized row to both the star tree builder and
 *       the stats collector.</li>
 *   <li>Build the star tree, then feed the aggregated documents it produced to the stats
 *       collector as well.</li>
 *   <li>Initialize the index creator from the collected stats and index every document (raw and
 *       aggregated) returned by the builder.</li>
 *   <li>Write builder-chosen defaults back into the spec, serialize the tree and run the
 *       post-creation step.</li>
 * </ol>
 *
 * <p>{@code starTreeBuilder.cleanup()} runs in a {@code finally} block, so it is invoked even when
 * one of the steps after {@code init} throws. On the success path the statement order is the same
 * as before: cleanup happens right after {@code handlePostCreation()} and before the timing log.
 *
 * @throws Exception if any step of the build fails
 */
private void buildStarTree() throws Exception {
  // Create stats collector
  SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
  statsCollector.init();
  segmentStats = statsCollector;
  long start = System.currentTimeMillis();

  // Construct star tree builder config; fall back to a default spec when none is configured.
  StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
  if (starTreeIndexSpec == null) {
    starTreeIndexSpec = new StarTreeIndexSpec();
    starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
    config.setStarTreeIndexSpec(starTreeIndexSpec);
  }

  // The time column is dropped from the split order. Note that this mutates the list instance
  // returned by the spec, not a copy.
  List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
  if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
    final String timeColumnName = config.getTimeColumnName();
    if (timeColumnName != null) {
      dimensionsSplitOrder.remove(timeColumnName);
    }
  }

  // Create star builder config from StarTreeIndexSpec. Merge these two in one later.
  StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
  starTreeBuilderConfig.setSchema(dataSchema);
  starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
  starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
  starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
  Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
  starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
  starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
  starTreeBuilderConfig.setOutDir(starTreeTempDir);
  boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
  starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);

  // Initialize star tree builder
  StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
  starTreeBuilder.init(starTreeBuilderConfig);

  // Assigned inside the try block; read by the timing log once the build has succeeded.
  long starTreeBuildFinishTime;
  long statCollectionFinishTime;
  try {
    // Build star tree along with collecting stats
    recordReader.rewind();
    LOGGER.info("Start append raw data to star tree builder!");
    totalDocs = 0;
    // NOTE(review): totalRawDocs and totalAggDocs are incremented below but, unlike totalDocs,
    // are not reset here - confirm they start at zero when this method is entered.
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (recordReader.hasNext()) {
      // PlainFieldExtractor conducts necessary type conversions
      transformedRow = readNextRowSanitized(readRow, transformedRow);
      // Must be called after previous step since type conversion for derived values is unnecessary
      populateDefaultDerivedColumnValues(transformedRow);
      starTreeBuilder.append(transformedRow);
      statsCollector.collectRow(transformedRow);
      totalRawDocs++;
      totalDocs++;
    }
    recordReader.close();

    LOGGER.info("Start building star tree!");
    starTreeBuilder.build();
    LOGGER.info("Finished building star tree!");
    starTreeBuildFinishTime = System.currentTimeMillis();

    // Build stats: the raw rows were collected above, so only the aggregated documents
    // (the range [rawCount, rawCount + aggregateCount)) are added here.
    LOGGER.info("Start building StatsCollector!");
    Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(), starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (aggregatedRowsIterator.hasNext()) {
      GenericRow genericRow = aggregatedRowsIterator.next();
      statsCollector.collectRow(genericRow, true);
      totalAggDocs++;
      totalDocs++;
    }
    statsCollector.build();
    buildIndexCreationInfo();
    LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
    statCollectionFinishTime = System.currentTimeMillis();

    // Initialize the index creation using the per-column statistics information
    indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);

    // Iterate over the data again, this time over raw and aggregated documents alike.
    Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (allRowsIterator.hasNext()) {
      GenericRow genericRow = allRowsIterator.next();
      indexCreator.indexRow(genericRow);
    }

    // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
    }
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
    }
    serializeTree(starTreeBuilder, enableOffHeapFormat);

    // Post creation
    handlePostCreation();
  } finally {
    // Always give the builder a chance to clean up, including when a step above failed.
    starTreeBuilder.cleanup();
  }

  long end = System.currentTimeMillis();
  LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}
Use of com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl in the Pinot project by LinkedIn.
Class RecordReaderSegmentCreationDataSource, method gatherStats.
/**
 * Runs every remaining record of {@code _recordReader} through a freshly created
 * {@code SegmentPreIndexStatsCollectorImpl} and returns the built collector.
 *
 * <p>Each record is obtained via {@code readNextRowSanitized}, which receives the given field
 * extractor and two row objects that are reused across iterations.
 *
 * @param fieldExtractor extractor handed to {@code readNextRowSanitized} for every record
 * @return the collector after {@code build()} has been called on it; {@code null} is returned
 *     only if an exception was caught and {@code Utils.rethrowException} returned normally
 */
@Override
public SegmentPreIndexStatsCollector gatherStats(FieldExtractor fieldExtractor) {
  try {
    SegmentPreIndexStatsCollector statsCollector =
        new SegmentPreIndexStatsCollectorImpl(_recordReader.getSchema());
    statsCollector.init();

    // Feed each record to the collector, recycling the same pair of row holders.
    GenericRow rawRow = new GenericRow();
    for (GenericRow sanitizedRow = new GenericRow(); _recordReader.hasNext(); ) {
      sanitizedRow = readNextRowSanitized(rawRow, sanitizedRow, fieldExtractor);
      statsCollector.collectRow(sanitizedRow);
    }

    statsCollector.build();
    return statsCollector;
  } catch (Exception e) {
    LOGGER.error("Caught exception while gathering stats", e);
    Utils.rethrowException(e);
    // Keeps the compiler satisfied: the call above is not declared as throwing.
    return null;
  }
}
Aggregations