Use of com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder in project pinot by linkedin.
Class OffheapStarTreeBuilderWithHllFieldTest, method testSimpleCore.
/**
 * Builds a star tree over synthetic rows that carry an HLL-derived metric and verifies the
 * aggregated documents it produces.
 *
 * <p>The schema is assembled as follows: the member-id dimension (INT), {@code numDimensions - 1}
 * STRING dimensions named {@code d2..dN}, a {@code daysSinceEpoch} time column,
 * {@code numMetrics - 1} INT metrics named {@code m1..}, and one HLL-derived STRING metric. The
 * member-id dimension is added to the schema only; it is put neither in the split order nor in
 * the skip-materialization set.
 *
 * <p>Verification is done with explicit {@link AssertionError}s rather than the {@code assert}
 * keyword, so the checks still run when the JVM is started without {@code -ea}.
 *
 * @param numDimensions total number of dimension columns, including the member-id column
 * @param numMetrics total number of metric columns, including the HLL-derived metric
 * @param numSkipMaterializationDimensions how many of the trailing {@code dN} dimensions are
 *     registered for skip-materialization instead of being added to the split order
 * @param memberIdColumnValues one member-id value per generated input row
 * @param preciseCardinality expected cardinality, compared against the HLL estimate read from the
 *     last aggregated row via {@code assertApproximation(..., 0.1)}
 * @throws Exception if building the tree or deleting the output directory fails
 */
private void testSimpleCore(int numDimensions, int numMetrics, int numSkipMaterializationDimensions, int[] memberIdColumnValues, long preciseCardinality) throws Exception {
  StarTreeBuilderConfig builderConfig = null;
  try {
    builderConfig = new StarTreeBuilderConfig();
    Schema schema = new Schema();
    builderConfig.dimensionsSplitOrder = new ArrayList<>();
    builderConfig.setSkipMaterializationForDimensions(new HashSet<String>());
    Set<String> skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();

    // add member id dimension spec
    String dimName = memberIdFieldName;
    DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.INT, true);
    schema.addField(dimensionFieldSpec);

    // add other dimension specs; the last numSkipMaterializationDimensions of them go into the
    // skip-materialization set, the rest into the split order
    for (int i = 1; i < numDimensions; i++) {
      dimName = "d" + (i + 1);
      dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.STRING, true);
      schema.addField(dimensionFieldSpec);
      if (i < (numDimensions - numSkipMaterializationDimensions)) {
        builderConfig.dimensionsSplitOrder.add(dimName);
      } else {
        builderConfig.getSkipMaterializationForDimensions().add(dimName);
      }
    }
    schema.setTimeFieldSpec(new TimeFieldSpec("daysSinceEpoch", DataType.INT, TimeUnit.DAYS));

    // add other metric specs
    for (int i = 0; i < numMetrics - 1; i++) {
      String metricName = "m" + (i + 1);
      MetricFieldSpec metricFieldSpec = new MetricFieldSpec(metricName, DataType.INT);
      schema.addField(metricFieldSpec);
    }

    // add hll metric
    String hllMetricName = memberIdFieldName + hllDeriveFieldSuffix;
    MetricFieldSpec hllDerivedFieldSpec = new MetricFieldSpec(hllMetricName, FieldSpec.DataType.STRING, HllUtil.getHllFieldSizeFromLog2m(log2m), MetricFieldSpec.DerivedMetricType.HLL);
    schema.addField(hllDerivedFieldSpec);

    builderConfig.maxLeafRecords = 10;
    builderConfig.schema = schema;
    builderConfig.setOutDir(new File("/tmp/startree"));

    OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
    builder.init(builderConfig);

    // fill values; the same map instance is reused for every row and every key is overwritten
    HashMap<String, Object> map = new HashMap<>();
    for (int row = 0; row < memberIdColumnValues.length; row++) {
      // add member id column
      dimName = memberIdFieldName;
      map.put(dimName, memberIdColumnValues[row]);
      // add other dimensions
      for (int i = 1; i < numDimensions; i++) {
        dimName = schema.getDimensionFieldSpecs().get(i).getName();
        map.put(dimName, dimName + "-v" + row % (numDimensions - i));
      }
      // add time column
      map.put("daysSinceEpoch", 1);
      // add other metrics
      for (int i = 0; i < numMetrics - 1; i++) {
        String metName = schema.getMetricFieldSpecs().get(i).getName();
        map.put(metName, 1);
      }
      // add hll column value
      map.put(hllMetricName, HllUtil.singleValueHllAsString(log2m, memberIdColumnValues[row]));

      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      builder.append(genericRow);
    }
    builder.build();

    // log every document (raw followed by aggregated) for inspection
    int totalDocs = builder.getTotalRawDocumentCount() + builder.getTotalAggregateDocumentCount();
    Iterator<GenericRow> iterator = builder.iterator(0, totalDocs);
    while (iterator.hasNext()) {
      GenericRow row = iterator.next();
      LOGGER.info(HllUtil.inspectGenericRow(row, hllDeriveFieldSuffix));
    }

    // walk only the aggregated documents and check the skipped dimensions
    iterator = builder.iterator(builder.getTotalRawDocumentCount(), totalDocs);
    GenericRow lastRow = null;
    while (iterator.hasNext()) {
      GenericRow row = iterator.next();
      for (String skipDimension : skipMaterializationForDimensions) {
        Object rowValue = row.getValue(skipDimension);
        // explicit failure instead of the assert keyword: not dependent on -ea, and null-safe
        if (!"ALL".equals(rowValue)) {
          throw new AssertionError("Expected \"ALL\" for skipped dimension " + skipDimension + " in aggregated row but was " + rowValue);
        }
      }
      lastRow = row;
    }

    // without this guard an empty aggregate range would surface as a bare NullPointerException
    if (lastRow == null) {
      throw new AssertionError("Star tree builder produced no aggregated documents to verify");
    }
    assertApproximation(HllUtil.convertStringToHll((String) lastRow.getValue(hllMetricName)).cardinality(), preciseCardinality, 0.1);
  } finally {
    if (builderConfig != null) {
      FileUtils.deleteDirectory(builderConfig.getOutDir());
    }
  }
}
Use of com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder in project pinot by linkedin.
Class SegmentIndexCreationDriverImpl, method buildStarTree.
/**
 * Builds the segment through a star tree builder.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>create the stats collector and derive a {@code StarTreeBuilderConfig} from the configured
 *       {@code StarTreeIndexSpec} (a default spec is created and stored in the config when none
 *       is set);</li>
 *   <li>read every record, append it to the builder and collect its stats;</li>
 *   <li>build the tree, then collect stats for the aggregated documents;</li>
 *   <li>initialize the index creator and index all raw and aggregated documents;</li>
 *   <li>write the split order / skip-materialization dimensions chosen by the builder back into
 *       the spec when the spec did not provide them, serialize the tree and run post-creation
 *       handling.</li>
 * </ol>
 *
 * <p>{@code starTreeBuilder.cleanup()} is invoked in a {@code finally} block so it also runs when
 * any step after builder initialization throws. An exception thrown by {@code cleanup()} itself
 * propagates to the caller.
 *
 * @throws Exception if any of the steps above fails
 */
private void buildStarTree() throws Exception {
  // Create stats collector
  SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
  statsCollector.init();
  segmentStats = statsCollector;
  long start = System.currentTimeMillis();

  // construct star tree builder config
  StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
  if (starTreeIndexSpec == null) {
    starTreeIndexSpec = new StarTreeIndexSpec();
    starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
    config.setStarTreeIndexSpec(starTreeIndexSpec);
  }

  // the time column is removed from the split order (this mutates the list held by the spec)
  List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
  if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
    final String timeColumnName = config.getTimeColumnName();
    if (timeColumnName != null) {
      dimensionsSplitOrder.remove(timeColumnName);
    }
  }

  // create star builder config from startreeindexspec. Merge these two in one later.
  StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
  starTreeBuilderConfig.setSchema(dataSchema);
  starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
  starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
  starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
  Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
  starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
  starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
  starTreeBuilderConfig.setOutDir(starTreeTempDir);
  boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
  starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);

  // initialize star tree builder
  StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
  starTreeBuilder.init(starTreeBuilderConfig);

  long starTreeBuildFinishTime;
  long statCollectionFinishTime;
  try {
    // build star tree along with collecting stats
    recordReader.rewind();
    LOGGER.info("Start append raw data to star tree builder!");
    totalDocs = 0;
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (recordReader.hasNext()) {
      // PlainFieldExtractor conducts necessary type conversions
      transformedRow = readNextRowSanitized(readRow, transformedRow);
      // must be called after previous step since type conversion for derived values is unnecessary
      populateDefaultDerivedColumnValues(transformedRow);
      starTreeBuilder.append(transformedRow);
      statsCollector.collectRow(transformedRow);
      totalRawDocs++;
      totalDocs++;
    }
    recordReader.close();

    LOGGER.info("Start building star tree!");
    starTreeBuilder.build();
    LOGGER.info("Finished building star tree!");
    starTreeBuildFinishTime = System.currentTimeMillis();

    // build stats
    // Count the number of documents and gather per-column statistics
    LOGGER.info("Start building StatsCollector!");
    Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(), starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (aggregatedRowsIterator.hasNext()) {
      GenericRow genericRow = aggregatedRowsIterator.next();
      statsCollector.collectRow(genericRow, true);
      totalAggDocs++;
      totalDocs++;
    }
    statsCollector.build();
    buildIndexCreationInfo();
    LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
    statCollectionFinishTime = System.currentTimeMillis();

    // Initialize the index creation using the per-column statistics information
    indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);

    // iterate over the data again, this time indexing raw and aggregated documents
    Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (allRowsIterator.hasNext()) {
      GenericRow genericRow = allRowsIterator.next();
      indexCreator.indexRow(genericRow);
    }

    // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
      starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
    }
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
      starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
    }
    serializeTree(starTreeBuilder, enableOffHeapFormat);

    // post creation
    handlePostCreation();
  } finally {
    // run cleanup on the failure path too, not only after a fully successful build
    starTreeBuilder.cleanup();
  }

  long end = System.currentTimeMillis();
  LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}
Aggregations