Examples with StarTreeIndexSpec - com.linkedin.pinot.common.data.StarTreeIndexSpec

Example 1 with StarTreeIndexSpec

use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.

the class SegmentWithHllIndexCreateHelper method setupStarTreeConfig.

/**
 * Enables star-tree index generation on the given segment generator config.
 *
 * <p>A new {@link StarTreeIndexSpec} with its max-leaf-records set to
 * {@link StarTreeIndexSpec#DEFAULT_MAX_LEAF_RECORDS} is attached to the config, after which
 * the config's schema is logged via {@code printSchema}.
 *
 * @param segmentGenConfig generator config that is modified in place
 */
private void setupStarTreeConfig(SegmentGeneratorConfig segmentGenConfig) {
    // Prepare the spec with the default leaf size before touching the config.
    final StarTreeIndexSpec defaultSpec = new StarTreeIndexSpec();
    defaultSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);

    // StarTree related switches on the generator config.
    segmentGenConfig.setEnableStarTreeIndex(true);
    segmentGenConfig.setStarTreeIndexSpec(defaultSpec);

    LOGGER.info("segmentGenConfig Schema (w/o derived fields): ");
    printSchema(segmentGenConfig.getSchema());
}

Also used : StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec)

Example 2 with StarTreeIndexSpec

use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.

the class SegmentIndexCreationDriverImpl method buildStarTree.

/**
 * Builds the segment with a star tree index.
 *
 * <p>The method runs in several passes:
 * <ol>
 *   <li>Resolve the {@link StarTreeIndexSpec} from the config (creating a default one if absent) and
 *       translate it into a {@link StarTreeBuilderConfig}.</li>
 *   <li>Read every record from {@code recordReader}, append it to the star tree builder and feed it to
 *       the stats collector.</li>
 *   <li>Build the star tree, then feed the aggregated documents produced by the builder to the stats
 *       collector as well.</li>
 *   <li>Initialize {@code indexCreator} from the collected stats and index all raw + aggregated rows.</li>
 *   <li>Write the builder-chosen split order / skip-materialization dimensions back into the spec when
 *       they were not configured, serialize the tree, run post-creation handling and clean up.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code recordReader.close()} and {@code starTreeBuilder.cleanup()} are only reached on
 * the success path; if any step throws, neither is called from here — confirm whether a caller handles that.
 *
 * @throws Exception propagated from any of the reader, builder, stats or index-creation steps
 */
private void buildStarTree() throws Exception {
    // Create stats collector and publish it as the segment-level stats holder
    SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
    statsCollector.init();
    segmentStats = statsCollector;
    long start = System.currentTimeMillis();
    // Construct star tree builder config; fall back to a default spec (stored back on the config) if none is set
    StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
    if (starTreeIndexSpec == null) {
        starTreeIndexSpec = new StarTreeIndexSpec();
        starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
        config.setStarTreeIndexSpec(starTreeIndexSpec);
    }
    // The time column is dropped from an explicitly configured split order.
    // NOTE(review): remove() mutates the list returned by getDimensionsSplitOrder(); if that is the spec's
    // own list, the spec is modified in place — confirm this is intended.
    List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
    if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
        final String timeColumnName = config.getTimeColumnName();
        if (timeColumnName != null) {
            dimensionsSplitOrder.remove(timeColumnName);
        }
    }
    // Create star tree builder config from the StarTreeIndexSpec. Merge these two in one later.
    StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
    starTreeBuilderConfig.setSchema(dataSchema);
    starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
    starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
    starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
    Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
    starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
    starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
    starTreeBuilderConfig.setOutDir(starTreeTempDir);
    boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
    starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);
    // Initialize star tree builder; an OffHeapStarTreeBuilder is used regardless of enableOffHeapFormat,
    // which is only passed through to the builder config and to serializeTree() below
    StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
    starTreeBuilder.init(starTreeBuilderConfig);
    // Build star tree along with collecting stats: first pass over the raw input
    recordReader.rewind();
    LOGGER.info("Start append raw data to star tree builder!");
    // Only totalDocs is reset here; totalRawDocs / totalAggDocs are incremented from their current values
    totalDocs = 0;
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (recordReader.hasNext()) {
        // PlainFieldExtractor conducts necessary type conversions
        transformedRow = readNextRowSanitized(readRow, transformedRow);
        // Must be called after previous step since type conversion for derived values is unnecessary
        populateDefaultDerivedColumnValues(transformedRow);
        starTreeBuilder.append(transformedRow);
        statsCollector.collectRow(transformedRow);
        totalRawDocs++;
        totalDocs++;
    }
    recordReader.close();
    LOGGER.info("Start building star tree!");
    starTreeBuilder.build();
    LOGGER.info("Finished building star tree!");
    long starTreeBuildFinishTime = System.currentTimeMillis();
    // Build stats
    // Count the number of documents and gather per-column statistics
    LOGGER.info("Start building StatsCollector!");
    // Iterate only the aggregated documents: the range [rawCount, rawCount + aggregateCount)
    Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(), starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (aggregatedRowsIterator.hasNext()) {
        GenericRow genericRow = aggregatedRowsIterator.next();
        // presumably the 'true' flag marks the row as an aggregated document — verify against collectRow()
        statsCollector.collectRow(genericRow, true);
        totalAggDocs++;
        totalDocs++;
    }
    statsCollector.build();
    buildIndexCreationInfo();
    LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
    long statCollectionFinishTime = System.currentTimeMillis();
    // Initialize the index creation using the per-column statistics information
    indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);
    // Iterate over the data again, this time over all documents (raw followed by aggregated), to index them
    Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (allRowsIterator.hasNext()) {
        GenericRow genericRow = allRowsIterator.next();
        indexCreator.indexRow(genericRow);
    }
    // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
        starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
    }
    // Likewise, record the skip-materialization dimensions reported by the builder when none were configured.
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
        starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
    }
    serializeTree(starTreeBuilder, enableOffHeapFormat);
    // Post creation
    handlePostCreation();
    starTreeBuilder.cleanup();
    long end = System.currentTimeMillis();
    // Timing breakdown: total, tree build (start -> build finished), stats collection, column index build
    LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}

Also used : SegmentPreIndexStatsCollectorImpl(com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec) StarTreeBuilderConfig(com.linkedin.pinot.core.startree.StarTreeBuilderConfig) OffHeapStarTreeBuilder(com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder) StarTreeBuilder(com.linkedin.pinot.core.startree.StarTreeBuilder)

Example 3 with StarTreeIndexSpec

use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.

the class SegmentColumnarIndexCreator method writeMetadata.

/**
 * Writes segment-level and per-column metadata into the segment metadata properties file
 * ({@code V1Constants.MetadataKeys.METADATA_FILE_NAME} under {@code file}) and saves it.
 *
 * <p>Properties are written in this order: general segment properties and counters, star tree
 * settings (when a {@link StarTreeIndexSpec} is configured), the HLL log2m (when an HLL config is
 * present), time range derived from the time column's stats, time properties taken from the config
 * when the corresponding custom property is present, all custom properties, and finally the
 * metadata of every column in {@code indexCreationInfoMap}.
 *
 * @throws ConfigurationException propagated from creating or saving the {@link PropertiesConfiguration}
 */
void writeMetadata() throws ConfigurationException {
    final File metadataFile = new File(file, V1Constants.MetadataKeys.METADATA_FILE_NAME);
    final PropertiesConfiguration properties = new PropertiesConfiguration(metadataFile);

    // General segment properties.
    properties.setProperty(SEGMENT_CREATOR_VERSION, config.getCreatorVersion());
    final String paddingCharacter = Character.toString(config.getPaddingCharacter());
    properties.setProperty(SEGMENT_PADDING_CHARACTER, StringEscapeUtils.escapeJava(paddingCharacter));
    properties.setProperty(SEGMENT_NAME, segmentName);
    properties.setProperty(TABLE_NAME, config.getTableName());
    properties.setProperty(DIMENSIONS, config.getDimensions());
    properties.setProperty(METRICS, config.getMetrics());
    properties.setProperty(TIME_COLUMN_NAME, config.getTimeColumnName());
    properties.setProperty(TIME_INTERVAL, "not_there");

    // Document counters and conversion statistics.
    properties.setProperty(SEGMENT_TOTAL_RAW_DOCS, String.valueOf(totalRawDocs));
    properties.setProperty(SEGMENT_TOTAL_AGGREGATE_DOCS, String.valueOf(totalAggDocs));
    properties.setProperty(SEGMENT_TOTAL_DOCS, String.valueOf(totalDocs));
    properties.setProperty(STAR_TREE_ENABLED, String.valueOf(config.isEnableStarTreeIndex()));
    properties.setProperty(SEGMENT_TOTAL_ERRORS, String.valueOf(totalErrors));
    properties.setProperty(SEGMENT_TOTAL_NULLS, String.valueOf(totalNulls));
    properties.setProperty(SEGMENT_TOTAL_CONVERSIONS, String.valueOf(totalConversions));
    properties.setProperty(SEGMENT_TOTAL_NULL_COLS, String.valueOf(totalNullCols));

    // Star tree settings are only recorded when a spec is configured.
    final StarTreeIndexSpec starTreeSpec = config.getStarTreeIndexSpec();
    if (starTreeSpec != null) {
        properties.setProperty(STAR_TREE_SPLIT_ORDER, starTreeSpec.getDimensionsSplitOrder());
        properties.setProperty(STAR_TREE_MAX_LEAF_RECORDS, starTreeSpec.getMaxLeafRecords());
        properties.setProperty(STAR_TREE_SKIP_STAR_NODE_CREATION_FOR_DIMENSIONS, starTreeSpec.getSkipStarNodeCreationForDimensions());
        properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_CARDINALITY, starTreeSpec.getskipMaterializationCardinalityThreshold());
        properties.setProperty(STAR_TREE_SKIP_MATERIALIZATION_FOR_DIMENSIONS, starTreeSpec.getskipMaterializationForDimensions());
    }

    // HLL settings; the derived-field map is consulted per column further down.
    Map<String, String> derivedHllFieldToOriginMap = null;
    final HllConfig hllConfig = config.getHllConfig();
    if (hllConfig != null) {
        properties.setProperty(SEGMENT_HLL_LOG2M, hllConfig.getHllLog2m());
        derivedHllFieldToOriginMap = hllConfig.getDerivedHllFieldToOriginMap();
    }

    // Time range taken from the time column's collected min/max, when stats for it exist.
    final String timeColumn = config.getTimeColumnName();
    final ColumnIndexCreationInfo timeColumnInfo = indexCreationInfoMap.get(timeColumn);
    if (timeColumnInfo != null) {
        properties.setProperty(SEGMENT_START_TIME, timeColumnInfo.getMin());
        properties.setProperty(SEGMENT_END_TIME, timeColumnInfo.getMax());
        properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit());
    }

    // Values from the config replace the derived ones when the matching custom property is present.
    if (config.containsCustomProperty(SEGMENT_START_TIME)) {
        properties.setProperty(SEGMENT_START_TIME, config.getStartTime());
    }
    if (config.containsCustomProperty(SEGMENT_END_TIME)) {
        properties.setProperty(SEGMENT_END_TIME, config.getEndTime());
    }
    if (config.containsCustomProperty(TIME_UNIT)) {
        properties.setProperty(TIME_UNIT, config.getSegmentTimeUnit());
    }

    // Custom properties are written last among the segment-level ones.
    for (Map.Entry<String, String> customProperty : config.getCustomProperties().entrySet()) {
        properties.setProperty(customProperty.getKey(), customProperty.getValue());
    }

    // Per-column metadata.
    for (Map.Entry<String, ColumnIndexCreationInfo> columnEntry : indexCreationInfoMap.entrySet()) {
        final String columnName = columnEntry.getKey();
        final ColumnIndexCreationInfo creationInfo = columnEntry.getValue();

        final SegmentDictionaryCreator dictionaryCreator = dictionaryCreatorMap.get(columnName);
        final int dictionaryElementSize = (dictionaryCreator == null) ? 0 : dictionaryCreator.getStringColumnMaxLength();

        // TODO: after fixing the server-side dependency on HAS_INVERTED_INDEX and deployed, set HAS_INVERTED_INDEX properly.
        // The hasInvertedIndex flag in segment metadata is picked up in ColumnMetadata and used during the query plan
        // phase. If it is false, inverted indexes are not used in queries even if they are created via table configs on
        // segment load. So it is hard-coded to true for now, until the server updates the value inside ColumnMetadata
        // and exports to the query planner that the inverted index is current and can be used.
        final boolean hasInvertedIndex = true;

        final String hllOriginColumn = (derivedHllFieldToOriginMap == null) ? null : derivedHllFieldToOriginMap.get(columnName);

        addColumnMetadataInfo(properties, columnName, creationInfo, totalDocs, totalRawDocs, totalAggDocs, schema.getFieldSpecFor(columnName), dictionaryCreatorMap.containsKey(columnName), dictionaryElementSize, hasInvertedIndex, hllOriginColumn);
    }

    properties.save();
}

Also used : HllConfig(com.linkedin.pinot.core.startree.hll.HllConfig) PropertiesConfiguration(org.apache.commons.configuration.PropertiesConfiguration) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec) ColumnIndexCreationInfo(com.linkedin.pinot.core.segment.creator.ColumnIndexCreationInfo) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with StarTreeIndexSpec

use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.

the class BaseClusterIntegrationTest method buildSegmentsFromAvro.

/**
 * Builds one Pinot segment per Avro file in parallel on the given executor and tars each segment.
 *
 * <p>Segment {@code k} (1-based position in {@code avroFiles}) is built into
 * {@code baseDirectory/segment-(k + baseSegmentIndex)} and then tarred into {@code segmentTarDir}.
 * The segment name postfix deliberately contains a space and a {@code %} character.
 *
 * @param avroFiles input Avro files, one segment is built per file
 * @param executor executor on which the per-segment build tasks are run
 * @param baseSegmentIndex offset added to the 1-based file position to obtain the segment number
 * @param baseDirectory parent directory of the per-segment output directories
 * @param segmentTarDir directory that receives the tarred segments
 * @param tableName table name passed to the segment generator config
 * @param createStarTreeIndex whether to enable the star tree index (in off-heap format) for the segments
 * @param inputPinotSchema schema for the segments; when non-null it is also set explicitly on the config
 * @return a future completing with a map from each input Avro file to its tarred segment file; any
 *         failure while building a segment is logged and rethrown wrapped in a {@link RuntimeException}
 */
public static Future<Map<File, File>> buildSegmentsFromAvro(final List<File> avroFiles, Executor executor, int baseSegmentIndex, final File baseDirectory, final File segmentTarDir, final String tableName, final boolean createStarTreeIndex, final com.linkedin.pinot.common.data.Schema inputPinotSchema) {
    int segmentCount = avroFiles.size();
    LOGGER.info("Building {} segments in parallel", segmentCount);
    List<ListenableFutureTask<Pair<File, File>>> futureTasks = new ArrayList<ListenableFutureTask<Pair<File, File>>>();
    for (int i = 1; i <= segmentCount; ++i) {
        final int segmentIndex = i - 1;
        final int segmentNumber = i + baseSegmentIndex;
        final ListenableFutureTask<Pair<File, File>> buildSegmentFutureTask = ListenableFutureTask.<Pair<File, File>>create(new Callable<Pair<File, File>>() {

            @Override
            public Pair<File, File> call() throws Exception {
                try {
                    // Build segment
                    LOGGER.info("Starting to build segment {}", segmentNumber);
                    File outputDir = new File(baseDirectory, "segment-" + segmentNumber);
                    final File inputAvroFile = avroFiles.get(segmentIndex);
                    final SegmentGeneratorConfig genConfig = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(inputAvroFile, outputDir, TimeUnit.DAYS, tableName, inputPinotSchema);
                    if (inputPinotSchema != null) {
                        genConfig.setSchema(inputPinotSchema);
                    }
                    // jfim: We add a space and a special character to do a regression test for PINOT-3296 Segments with spaces
                    // in their filename don't work properly
                    genConfig.setSegmentNamePostfix(Integer.toString(segmentNumber) + " %");
                    genConfig.setEnableStarTreeIndex(createStarTreeIndex);
                    // Enable off heap star tree format in the integration test.
                    StarTreeIndexSpec starTreeIndexSpec = null;
                    if (createStarTreeIndex) {
                        starTreeIndexSpec = new StarTreeIndexSpec();
                        starTreeIndexSpec.setEnableOffHeapFormat(true);
                    }
                    genConfig.setStarTreeIndexSpec(starTreeIndexSpec);
                    final SegmentIndexCreationDriver driver = SegmentCreationDriverFactory.get(null);
                    driver.init(genConfig);
                    driver.build();
                    // Tar segment. File.list() returns null when the directory is missing or unreadable, so fail
                    // with a descriptive message instead of a bare NullPointerException / index error.
                    final String[] outputDirContents = outputDir.list();
                    if (outputDirContents == null || outputDirContents.length == 0) {
                        throw new IllegalStateException("No segment found in output directory " + outputDir.getAbsolutePath());
                    }
                    String segmentName = outputDirContents[0];
                    final String tarGzPath = TarGzCompressionUtils.createTarGzOfDirectory(outputDir.getAbsolutePath() + "/" + segmentName, new File(segmentTarDir, segmentName).getAbsolutePath());
                    LOGGER.info("Completed segment {} : {} from file {}", segmentNumber, segmentName, inputAvroFile.getName());
                    return new ImmutablePair<File, File>(inputAvroFile, new File(tarGzPath));
                } catch (Exception e) {
                    // Pass the exception as the trailing argument so the stack trace is logged with the message.
                    LOGGER.error("Exception while building segment input: {} output {} ", avroFiles.get(segmentIndex), "segment-" + segmentNumber, e);
                    throw new RuntimeException(e);
                }
            }
        });
        futureTasks.add(buildSegmentFutureTask);
        executor.execute(buildSegmentFutureTask);
    }
    // Combine the per-segment results into a single avro-file -> tarred-segment map.
    ListenableFuture<List<Pair<File, File>>> pairListFuture = Futures.allAsList(futureTasks);
    return Futures.transform(pairListFuture, new AsyncFunction<List<Pair<File, File>>, Map<File, File>>() {

        @Override
        public ListenableFuture<Map<File, File>> apply(List<Pair<File, File>> input) throws Exception {
            Map<File, File> avroToSegmentMap = new HashMap<File, File>();
            for (Pair<File, File> avroToSegmentPair : input) {
                avroToSegmentMap.put(avroToSegmentPair.getLeft(), avroToSegmentPair.getRight());
            }
            return Futures.immediateFuture(avroToSegmentMap);
        }
    });
}

Also used : SegmentIndexCreationDriver(com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver) ArrayList(java.util.ArrayList) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec) JSONException(org.json.JSONException) ArchiveException(org.apache.commons.compress.archivers.ArchiveException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ListenableFutureTask(com.google.common.util.concurrent.ListenableFutureTask) SegmentGeneratorConfig(com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) List(java.util.List) File(java.io.File) Map(java.util.Map) HashMap(java.util.HashMap) Pair(org.apache.commons.lang3.tuple.Pair) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair)

Example 5 with StarTreeIndexSpec

use of com.linkedin.pinot.common.data.StarTreeIndexSpec in project pinot by linkedin.

the class TestStarTreeMetadata method setupSegment.

/**
 * Builds a star-tree-enabled test segment into {@code segmentDir} from the {@code AVRO_DATA} resource.
 *
 * <p>An existing {@code segmentDir} is deleted first. The star tree spec is populated from the
 * test's constants (split order, max leaf records, cardinality threshold and the two skip lists).
 *
 * @param segmentDir output directory for the generated segment
 * @throws Exception propagated from resource lookup, config creation or the segment build
 */
private void setupSegment(File segmentDir) throws Exception {
    final String avroPath = TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(AVRO_DATA));

    // Start from a clean output directory.
    if (segmentDir.exists()) {
        FileUtils.deleteQuietly(segmentDir);
    }

    // Star tree settings under test.
    final StarTreeIndexSpec spec = new StarTreeIndexSpec();
    spec.setDimensionsSplitOrder(DIMENSIONS_SPLIT_ORDER);
    spec.setMaxLeafRecords(MAX_LEAF_RECORDS);
    spec.setSkipMaterializationCardinalityThreshold(SKIP_CARDINALITY_THRESHOLD);
    spec.setSkipStarNodeCreationForDimensions(SKIP_STAR_NODE_CREATION_DIMENSTIONS);
    spec.setSkipMaterializationForDimensions(SKIP_MATERIALIZATION_DIMENSIONS);

    // Generator config for the Avro input, with the star tree index switched on.
    final File avroFile = new File(avroPath);
    final SegmentGeneratorConfig generatorConfig = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(avroFile, segmentDir, "time_day", TimeUnit.DAYS, TABLE_NAME);
    generatorConfig.setTableName(TABLE_NAME);
    generatorConfig.setSegmentName(SEGMENT_NAME);
    generatorConfig.setEnableStarTreeIndex(true);
    generatorConfig.setStarTreeIndexSpec(spec);

    final SegmentIndexCreationDriver creationDriver = SegmentCreationDriverFactory.get(null);
    creationDriver.init(generatorConfig);
    creationDriver.build();
}

Also used : SegmentIndexCreationDriver(com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver) SegmentGeneratorConfig(com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig) File(java.io.File) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec)

Aggregations

StarTreeIndexSpec (com.linkedin.pinot.common.data.StarTreeIndexSpec) 5, File (java.io.File) 3, SegmentGeneratorConfig (com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig) 2, SegmentIndexCreationDriver (com.linkedin.pinot.core.segment.creator.SegmentIndexCreationDriver) 2, HashMap (java.util.HashMap) 2, Map (java.util.Map) 2, ListenableFuture (com.google.common.util.concurrent.ListenableFuture) 1, ListenableFutureTask (com.google.common.util.concurrent.ListenableFutureTask) 1, GenericRow (com.linkedin.pinot.core.data.GenericRow) 1, ColumnIndexCreationInfo (com.linkedin.pinot.core.segment.creator.ColumnIndexCreationInfo) 1, SegmentPreIndexStatsCollectorImpl (com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl) 1, OffHeapStarTreeBuilder (com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder) 1, StarTreeBuilder (com.linkedin.pinot.core.startree.StarTreeBuilder) 1, StarTreeBuilderConfig (com.linkedin.pinot.core.startree.StarTreeBuilderConfig) 1, HllConfig (com.linkedin.pinot.core.startree.hll.HllConfig) 1, IOException (java.io.IOException) 1, SQLException (java.sql.SQLException) 1, ArrayList (java.util.ArrayList) 1, List (java.util.List) 1, ArchiveException (org.apache.commons.compress.archivers.ArchiveException) 1