Search in sources:

Example 1 with StarTreeBuilderConfig

use of com.linkedin.pinot.core.startree.StarTreeBuilderConfig in project pinot by linkedin.

In the class OffheapStarTreeBuilderWithHllFieldTest, method testSimpleCore.

/**
 * Builds a star tree over synthetic rows containing an HLL-derived metric and verifies the result.
 *
 * <p>The schema consists of the member id dimension (INT), {@code numDimensions - 1} STRING
 * dimensions named {@code d2 .. dN}, a {@code daysSinceEpoch} time column,
 * {@code numMetrics - 1} INT metrics named {@code m1 ..}, and one HLL-derived STRING metric.
 * One row is appended per entry of {@code memberIdColumnValues}. After the build, every document
 * in the range starting at the raw document count must carry the value {@code "ALL"} for each
 * skip-materialization dimension, and the HLL cardinality of the last such document is compared
 * with {@code preciseCardinality} via {@code assertApproximation} (third argument 0.1).
 *
 * <p>Checks are performed with explicit {@link AssertionError}s rather than the {@code assert}
 * keyword, so they are enforced even when the JVM runs without {@code -ea}.
 *
 * @param numDimensions total number of dimensions, including the member id dimension
 * @param numMetrics total number of metrics, including the HLL-derived metric
 * @param numSkipMaterializationDimensions number of trailing dimensions registered in the
 *     skip-materialization set instead of the split order
 * @param memberIdColumnValues member id value for each generated row
 * @param preciseCardinality expected cardinality the HLL estimate is compared against
 * @throws Exception if building the tree or cleaning up the output directory fails
 */
private void testSimpleCore(int numDimensions, int numMetrics, int numSkipMaterializationDimensions, int[] memberIdColumnValues, long preciseCardinality) throws Exception {
    StarTreeBuilderConfig builderConfig = null;
    try {
        builderConfig = new StarTreeBuilderConfig();
        Schema schema = new Schema();
        builderConfig.dimensionsSplitOrder = new ArrayList<>();
        builderConfig.setSkipMaterializationForDimensions(new HashSet<String>());
        Set<String> skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
        // add member id dimension spec
        String dimName = memberIdFieldName;
        DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.INT, true);
        schema.addField(dimensionFieldSpec);
        // add other dimension specs; index 0 is the member id dimension added above
        for (int i = 1; i < numDimensions; i++) {
            dimName = "d" + (i + 1);
            dimensionFieldSpec = new DimensionFieldSpec(dimName, DataType.STRING, true);
            schema.addField(dimensionFieldSpec);
            // the last numSkipMaterializationDimensions dimensions go to the skip set
            if (i < (numDimensions - numSkipMaterializationDimensions)) {
                builderConfig.dimensionsSplitOrder.add(dimName);
            } else {
                builderConfig.getSkipMaterializationForDimensions().add(dimName);
            }
        }
        schema.setTimeFieldSpec(new TimeFieldSpec("daysSinceEpoch", DataType.INT, TimeUnit.DAYS));
        // add other metric specs; the remaining metric slot is the HLL metric below
        for (int i = 0; i < numMetrics - 1; i++) {
            String metricName = "m" + (i + 1);
            MetricFieldSpec metricFieldSpec = new MetricFieldSpec(metricName, DataType.INT);
            schema.addField(metricFieldSpec);
        }
        // add hll metric
        String hllMetricName = memberIdFieldName + hllDeriveFieldSuffix;
        MetricFieldSpec hllDerivedFieldSpec = new MetricFieldSpec(hllMetricName, FieldSpec.DataType.STRING, HllUtil.getHllFieldSizeFromLog2m(log2m), MetricFieldSpec.DerivedMetricType.HLL);
        schema.addField(hllDerivedFieldSpec);
        // remaining builder configuration
        builderConfig.maxLeafRecords = 10;
        builderConfig.schema = schema;
        builderConfig.setOutDir(new File("/tmp/startree"));
        // create and initialize the builder
        OffHeapStarTreeBuilder builder = new OffHeapStarTreeBuilder();
        builder.init(builderConfig);
        // fill values; the same map is reused because every row overwrites every key
        HashMap<String, Object> map = new HashMap<>();
        for (int row = 0; row < memberIdColumnValues.length; row++) {
            // add member id column
            dimName = memberIdFieldName;
            map.put(dimName, memberIdColumnValues[row]);
            // add other dimensions
            for (int i = 1; i < numDimensions; i++) {
                dimName = schema.getDimensionFieldSpecs().get(i).getName();
                map.put(dimName, dimName + "-v" + row % (numDimensions - i));
            }
            // add time column
            map.put("daysSinceEpoch", 1);
            // add other metrics
            for (int i = 0; i < numMetrics - 1; i++) {
                String metName = schema.getMetricFieldSpecs().get(i).getName();
                map.put(metName, 1);
            }
            // add hll column value
            map.put(hllMetricName, HllUtil.singleValueHllAsString(log2m, memberIdColumnValues[row]));
            // hand the row to the builder
            GenericRow genericRow = new GenericRow();
            genericRow.init(map);
            builder.append(genericRow);
        }
        builder.build();
        int totalDocs = builder.getTotalRawDocumentCount() + builder.getTotalAggregateDocumentCount();
        // log every document for inspection
        Iterator<GenericRow> iterator = builder.iterator(0, totalDocs);
        while (iterator.hasNext()) {
            GenericRow row = iterator.next();
            LOGGER.info(HllUtil.inspectGenericRow(row, hllDeriveFieldSuffix));
        }
        // verify the documents that follow the raw documents
        iterator = builder.iterator(builder.getTotalRawDocumentCount(), totalDocs);
        GenericRow lastRow = null;
        while (iterator.hasNext()) {
            GenericRow row = iterator.next();
            for (String skipDimension : skipMaterializationForDimensions) {
                String rowValue = (String) row.getValue(skipDimension);
                // explicit check: the `assert` keyword is a no-op unless the JVM runs with -ea
                if (!"ALL".equals(rowValue)) {
                    throw new AssertionError("Expected skipped dimension " + skipDimension + " to be \"ALL\" but was " + rowValue);
                }
            }
            lastRow = row;
        }
        // fail with a clear message instead of a NullPointerException when nothing was iterated
        if (lastRow == null) {
            throw new AssertionError("Star tree builder produced no documents after the raw documents");
        }
        assertApproximation(HllUtil.convertStringToHll((String) lastRow.getValue(hllMetricName)).cardinality(), preciseCardinality, 0.1);
    } finally {
        if (builderConfig != null) {
            FileUtils.deleteDirectory(builderConfig.getOutDir());
        }
    }
}
Also used : GenericRow(com.linkedin.pinot.core.data.GenericRow) File(java.io.File) StarTreeBuilderConfig(com.linkedin.pinot.core.startree.StarTreeBuilderConfig) OffHeapStarTreeBuilder(com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder)

Example 2 with StarTreeBuilderConfig

use of com.linkedin.pinot.core.startree.StarTreeBuilderConfig in project pinot by linkedin.

In the class SegmentIndexCreationDriverImpl, method buildStarTree.

/**
 * Builds a segment with a star tree index.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Create the stats collector and derive a {@code StarTreeBuilderConfig} from the
 *       {@code StarTreeIndexSpec} (a default spec is created and stored on the config when none
 *       is set).</li>
 *   <li>Append every record from {@code recordReader} to the star tree builder while collecting
 *       stats, then build the tree.</li>
 *   <li>Feed the aggregated documents to the stats collector and build the index creation info.</li>
 *   <li>Index every document (raw followed by aggregated) with {@code indexCreator}.</li>
 *   <li>Write back the split order / skip-materialization dimensions chosen by the builder when
 *       the spec did not provide them, serialize the tree, and run post-creation handling.</li>
 * </ol>
 *
 * <p>{@code starTreeBuilder.cleanup()} is invoked in a {@code finally} block so it also runs
 * when any step after initialization throws.
 *
 * @throws Exception if reading records, building the tree, or creating the index fails
 */
private void buildStarTree() throws Exception {
    // Create stats collector
    SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
    statsCollector.init();
    segmentStats = statsCollector;
    long start = System.currentTimeMillis();
    // construct star tree builder config
    StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
    if (starTreeIndexSpec == null) {
        starTreeIndexSpec = new StarTreeIndexSpec();
        starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
        config.setStarTreeIndexSpec(starTreeIndexSpec);
    }
    List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
    if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
        // the time column is removed from the split order (this mutates the list held by the spec)
        final String timeColumnName = config.getTimeColumnName();
        if (timeColumnName != null) {
            dimensionsSplitOrder.remove(timeColumnName);
        }
    }
    // create star builder config from startreeindexspec. Merge these two in one later.
    StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
    starTreeBuilderConfig.setSchema(dataSchema);
    starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
    starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
    starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
    Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
    starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
    starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
    starTreeBuilderConfig.setOutDir(starTreeTempDir);
    boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
    starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);
    // initialize star tree builder
    StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
    starTreeBuilder.init(starTreeBuilderConfig);
    long starTreeBuildFinishTime;
    long statCollectionFinishTime;
    try {
        // build star tree along with collecting stats
        recordReader.rewind();
        LOGGER.info("Start append raw data to star tree builder!");
        totalDocs = 0;
        GenericRow readRow = new GenericRow();
        GenericRow transformedRow = new GenericRow();
        while (recordReader.hasNext()) {
            // PlainFieldExtractor conducts necessary type conversions
            transformedRow = readNextRowSanitized(readRow, transformedRow);
            // must be called after previous step since type conversion for derived values is unnecessary
            populateDefaultDerivedColumnValues(transformedRow);
            starTreeBuilder.append(transformedRow);
            statsCollector.collectRow(transformedRow);
            totalRawDocs++;
            totalDocs++;
        }
        recordReader.close();
        LOGGER.info("Start building star tree!");
        starTreeBuilder.build();
        LOGGER.info("Finished building star tree!");
        starTreeBuildFinishTime = System.currentTimeMillis();
        // build stats
        // Count the number of documents and gather per-column statistics
        LOGGER.info("Start building StatsCollector!");
        int rawDocCount = starTreeBuilder.getTotalRawDocumentCount();
        int rawPlusAggDocCount = rawDocCount + starTreeBuilder.getTotalAggregateDocumentCount();
        // aggregated documents occupy the range after the raw documents
        Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(rawDocCount, rawPlusAggDocCount);
        while (aggregatedRowsIterator.hasNext()) {
            GenericRow genericRow = aggregatedRowsIterator.next();
            statsCollector.collectRow(genericRow, true);
            totalAggDocs++;
            totalDocs++;
        }
        statsCollector.build();
        buildIndexCreationInfo();
        LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
        statCollectionFinishTime = System.currentTimeMillis();
        // Initialize the index creation using the per-column statistics information
        indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);
        // iterate over the data again, this time indexing raw and aggregated documents
        int allDocCount = starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount();
        Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, allDocCount);
        while (allRowsIterator.hasNext()) {
            GenericRow genericRow = allRowsIterator.next();
            indexCreator.indexRow(genericRow);
        }
        // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
        if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
            starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
        }
        if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
            starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
        }
        serializeTree(starTreeBuilder, enableOffHeapFormat);
        // post creation
        handlePostCreation();
    } finally {
        // always run builder cleanup, even when a step above fails
        starTreeBuilder.cleanup();
    }
    long end = System.currentTimeMillis();
    LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}
Also used : SegmentPreIndexStatsCollectorImpl(com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec) StarTreeBuilderConfig(com.linkedin.pinot.core.startree.StarTreeBuilderConfig) OffHeapStarTreeBuilder(com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder) StarTreeBuilder(com.linkedin.pinot.core.startree.StarTreeBuilder) OffHeapStarTreeBuilder(com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder)

Aggregations

GenericRow (com.linkedin.pinot.core.data.GenericRow): 2; OffHeapStarTreeBuilder (com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder): 2; StarTreeBuilderConfig (com.linkedin.pinot.core.startree.StarTreeBuilderConfig): 2; StarTreeIndexSpec (com.linkedin.pinot.common.data.StarTreeIndexSpec): 1; SegmentPreIndexStatsCollectorImpl (com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl): 1; StarTreeBuilder (com.linkedin.pinot.core.startree.StarTreeBuilder): 1; File (java.io.File): 1