
Example 31 with GenericRow

Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.

From class TransformGroupByTest, method buildSegment:

/**
   * Helper method to build a segment with one dimension column containing values
   * from {@link #_dimensionValues}, and one metric column.
   *
   * Also builds the expected group by result as it builds the segments.
   *
   * @param segmentDirName Name of segment directory
   * @param segmentName Name of segment
   * @param schema Schema for segment
   * @return RecordReader over the generated rows
   * @throws Exception
   */
private RecordReader buildSegment(String segmentDirName, String segmentName, Schema schema) throws Exception {
    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setOutDir(segmentDirName);
    config.setFormat(FileFormat.AVRO);
    config.setTableName(TABLE_NAME);
    config.setSegmentName(segmentName);
    Random random = new Random(RANDOM_SEED);
    long currentTimeMillis = System.currentTimeMillis();
    // Divide the day into fixed parts, and decrement time column value by this delta, so as to get
    // continuous days in the input. This gives about 10 days per 10k rows.
    long timeDelta = TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS) / 1000;
    final List<GenericRow> data = new ArrayList<>();
    int numDimValues = _dimensionValues.length;
    for (int row = 0; row < NUM_ROWS; row++) {
        HashMap<String, Object> map = new HashMap<>();
        map.put(DIMENSION_NAME, _dimensionValues[random.nextInt(numDimValues)]);
        map.put(METRIC_NAME, random.nextDouble());
        map.put(TIME_COLUMN_NAME, currentTimeMillis);
        currentTimeMillis -= timeDelta;
        GenericRow genericRow = new GenericRow();
        genericRow.init(map);
        data.add(genericRow);
    }
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader reader = new TestUtils.GenericRowRecordReader(schema, data);
    driver.init(config, reader);
    driver.build();
    LOGGER.info("Built segment {} at {}", segmentName, segmentDirName);
    return reader;
}
Also used : HashMap(java.util.HashMap) RecordReader(com.linkedin.pinot.core.data.readers.RecordReader) ArrayList(java.util.ArrayList) SegmentIndexCreationDriverImpl(com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) Random(java.util.Random) SegmentGeneratorConfig(com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig)
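
Both of these test helpers build their input the same way: populate a plain HashMap keyed by column name, wrap it in a GenericRow via init(Map), and collect the rows in a list. Below is a minimal, self-contained sketch of just that step; the column names, row count, and seed are illustrative, not taken from the Pinot tests.

import com.linkedin.pinot.core.data.GenericRow;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;

public class GenericRowSketch {
    public static void main(String[] args) {
        // Fixed seed so the generated rows are reproducible, mirroring RANDOM_SEED in the tests.
        Random random = new Random(42L);
        List<GenericRow> data = new ArrayList<>();
        for (int row = 0; row < 100; row++) {
            HashMap<String, Object> map = new HashMap<>();
            map.put("dimension", "dimValue" + random.nextInt(10)); // single-value dimension column
            map.put("metric", random.nextDouble());                // double metric column
            GenericRow genericRow = new GenericRow();
            genericRow.init(map); // GenericRow is backed by the column-name -> value map
            data.add(genericRow);
        }
        System.out.println("Built " + data.size() + " in-memory rows");
    }
}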

Example 32 with GenericRow

Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.

From class TransformExpressionOperatorTest, method buildSegment:

/**
   * Helper method to build a segment with {@link #NUM_METRICS} metric columns populated with
   * random data, as per the schema.
   *
   * @param segmentDirName Name of segment directory
   * @param segmentName Name of segment
   * @param schema Schema for segment
   * @return Schema built for the segment
   * @throws Exception
   */
private Schema buildSegment(String segmentDirName, String segmentName, Schema schema) throws Exception {
    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setOutDir(segmentDirName);
    config.setFormat(FileFormat.AVRO);
    config.setSegmentName(segmentName);
    Random random = new Random(RANDOM_SEED);
    final List<GenericRow> data = new ArrayList<>();
    _values = new double[NUM_ROWS][NUM_METRICS];
    for (int row = 0; row < NUM_ROWS; row++) {
        HashMap<String, Object> map = new HashMap<>();
        // Metric columns.
        for (int i = 0; i < NUM_METRICS; i++) {
            String metName = schema.getMetricFieldSpecs().get(i).getName();
            double value = random.nextInt(MAX_METRIC_VALUE) + random.nextDouble() + 1.0;
            map.put(metName, value);
            _values[row][i] = value;
        }
        GenericRow genericRow = new GenericRow();
        genericRow.init(map);
        data.add(genericRow);
    }
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    RecordReader reader = new TestUtils.GenericRowRecordReader(schema, data);
    driver.init(config, reader);
    driver.build();
    LOGGER.info("Built segment {} at {}", segmentName, segmentDirName);
    return schema;
}
Also used : HashMap(java.util.HashMap) RecordReader(com.linkedin.pinot.core.data.readers.RecordReader) ArrayList(java.util.ArrayList) SegmentIndexCreationDriverImpl(com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) Random(java.util.Random) SegmentGeneratorConfig(com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig)
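
The tail of both helpers is also identical: wrap the in-memory rows in a TestUtils.GenericRowRecordReader and hand them to SegmentIndexCreationDriverImpl together with a SegmentGeneratorConfig. Here is a condensed sketch of that hand-off using only calls that appear verbatim in the examples above (imports as in the "Also used" lists); the method name is illustrative and the arguments are assumed to come from the surrounding test.

private void createSegment(String segmentDirName, String segmentName, Schema schema, List<GenericRow> data) throws Exception {
    // Where the segment is written and under what name.
    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setOutDir(segmentDirName);
    config.setFormat(FileFormat.AVRO);
    config.setSegmentName(segmentName);
    // Expose the rows through a RecordReader and run the segment creation driver.
    RecordReader reader = new TestUtils.GenericRowRecordReader(schema, data);
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    driver.init(config, reader);
    driver.build();
}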

Example 33 with GenericRow

Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.

From class SegmentIndexCreationDriverImpl, method buildStarTree:

private void buildStarTree() throws Exception {
    // Create stats collector
    SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
    statsCollector.init();
    segmentStats = statsCollector;
    long start = System.currentTimeMillis();
    //construct star tree builder config
    StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
    if (starTreeIndexSpec == null) {
        starTreeIndexSpec = new StarTreeIndexSpec();
        starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
        config.setStarTreeIndexSpec(starTreeIndexSpec);
    }
    List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
    if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
        final String timeColumnName = config.getTimeColumnName();
        if (timeColumnName != null) {
            dimensionsSplitOrder.remove(timeColumnName);
        }
    }
    //create star tree builder config from the StarTreeIndexSpec. TODO: merge these two into one later.
    StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
    starTreeBuilderConfig.setSchema(dataSchema);
    starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
    starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
    starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
    Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
    starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
    starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
    starTreeBuilderConfig.setOutDir(starTreeTempDir);
    boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
    starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);
    //initialize star tree builder
    StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
    starTreeBuilder.init(starTreeBuilderConfig);
    //build star tree along with collecting stats
    recordReader.rewind();
    LOGGER.info("Start append raw data to star tree builder!");
    totalDocs = 0;
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (recordReader.hasNext()) {
        //PlainFieldExtractor conducts necessary type conversions
        transformedRow = readNextRowSanitized(readRow, transformedRow);
        //must be called after previous step since type conversion for derived values is unnecessary
        populateDefaultDerivedColumnValues(transformedRow);
        starTreeBuilder.append(transformedRow);
        statsCollector.collectRow(transformedRow);
        totalRawDocs++;
        totalDocs++;
    }
    recordReader.close();
    LOGGER.info("Start building star tree!");
    starTreeBuilder.build();
    LOGGER.info("Finished building star tree!");
    long starTreeBuildFinishTime = System.currentTimeMillis();
    //build stats
    // Count the number of documents and gather per-column statistics
    LOGGER.info("Start building StatsCollector!");
    Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(), starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (aggregatedRowsIterator.hasNext()) {
        GenericRow genericRow = aggregatedRowsIterator.next();
        statsCollector.collectRow(genericRow, true);
        totalAggDocs++;
        totalDocs++;
    }
    statsCollector.build();
    buildIndexCreationInfo();
    LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
    long statCollectionFinishTime = System.currentTimeMillis();
    // Initialize the index creation using the per-column statistics information
    indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);
    //iterate over the data again to build the column indexes
    Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0, starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
    while (allRowsIterator.hasNext()) {
        GenericRow genericRow = allRowsIterator.next();
        indexCreator.indexRow(genericRow);
    }
    // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
    if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
        starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
    }
    if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
        starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
    }
    serializeTree(starTreeBuilder, enableOffHeapFormat);
    //post creation
    handlePostCreation();
    starTreeBuilder.cleanup();
    long end = System.currentTimeMillis();
    LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}", (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime, end - statCollectionFinishTime);
}
Also used : SegmentPreIndexStatsCollectorImpl(com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) StarTreeIndexSpec(com.linkedin.pinot.common.data.StarTreeIndexSpec) StarTreeBuilderConfig(com.linkedin.pinot.core.startree.StarTreeBuilderConfig) StarTreeBuilder(com.linkedin.pinot.core.startree.StarTreeBuilder) OffHeapStarTreeBuilder(com.linkedin.pinot.core.startree.OffHeapStarTreeBuilder)
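
For callers that want a star tree built for their segment, the knobs live on StarTreeIndexSpec, which buildStarTree() reads from the SegmentGeneratorConfig (falling back to a spec with DEFAULT_MAX_LEAF_RECORDS when none is set). Below is a hedged sketch of wiring one up before building, using only the setters visible above; the helper name is illustrative, and whether the driver actually calls buildStarTree() depends on the rest of the config, which is not shown here.

private SegmentGeneratorConfig configWithStarTree(Schema schema, List<String> dimensionsSplitOrder) {
    StarTreeIndexSpec starTreeIndexSpec = new StarTreeIndexSpec();
    // Same default that buildStarTree() falls back to when no spec is configured.
    starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
    // Split order controls which dimensions the star tree branches on first;
    // buildStarTree() removes the time column from it automatically.
    starTreeIndexSpec.setDimensionsSplitOrder(dimensionsSplitOrder);
    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setStarTreeIndexSpec(starTreeIndexSpec);
    return config;
}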

Example 34 with GenericRow

Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.

From class IndexSegmentImpl, method iterator:

public Iterator<GenericRow> iterator(final int startDocId, final int endDocId) {
    final Map<String, BlockSingleValIterator> singleValIteratorMap = new HashMap<>();
    final Map<String, BlockMultiValIterator> multiValIteratorMap = new HashMap<>();
    for (String column : getColumnNames()) {
        DataSource dataSource = getDataSource(column);
        BlockValIterator iterator = dataSource.getNextBlock().getBlockValueSet().iterator();
        if (dataSource.getDataSourceMetadata().isSingleValue()) {
            singleValIteratorMap.put(column, (BlockSingleValIterator) iterator);
        } else {
            multiValIteratorMap.put(column, (BlockMultiValIterator) iterator);
        }
    }
    return new Iterator<GenericRow>() {

        int docId = startDocId;

        @Override
        public boolean hasNext() {
            return docId < endDocId;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        @Override
        public GenericRow next() {
            Map<String, Object> map = new HashMap<>();
            for (String column : singleValIteratorMap.keySet()) {
                int dictId = singleValIteratorMap.get(column).nextIntVal();
                Dictionary dictionary = getDictionaryFor(column);
                map.put(column, dictionary.get(dictId));
            }
            for (String column : multiValIteratorMap.keySet()) {
            //TODO:handle multi value
            }
            GenericRow genericRow = new GenericRow();
            genericRow.init(map);
            docId++;
            return genericRow;
        }
    };
}
Also used : Dictionary(com.linkedin.pinot.core.segment.index.readers.Dictionary) HashMap(java.util.HashMap) DataSource(com.linkedin.pinot.core.common.DataSource) GenericRow(com.linkedin.pinot.core.data.GenericRow) BlockSingleValIterator(com.linkedin.pinot.core.common.BlockSingleValIterator) BlockMultiValIterator(com.linkedin.pinot.core.common.BlockMultiValIterator) BlockValIterator(com.linkedin.pinot.core.common.BlockValIterator) Iterator(java.util.Iterator)
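
A short usage sketch of this iterator: given an already-loaded IndexSegmentImpl (how it is loaded is outside this example), walk a doc-id range and inspect the reconstructed rows. Per the TODO above, multi-value columns are not populated.

// indexSegment is assumed to be a loaded IndexSegmentImpl; 0..10 is an arbitrary doc-id range.
Iterator<GenericRow> rows = indexSegment.iterator(0, 10);
while (rows.hasNext()) {
    GenericRow row = rows.next();
    // Each row carries the dictionary-decoded single-value columns set via init(Map).
    System.out.println(row);
}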

Example 35 with GenericRow

Use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.

From class RecordReaderSegmentCreationDataSource, method gatherStats:

@Override
public SegmentPreIndexStatsCollector gatherStats(FieldExtractor fieldExtractor) {
    try {
        SegmentPreIndexStatsCollector collector = new SegmentPreIndexStatsCollectorImpl(_recordReader.getSchema());
        collector.init();
        // Gather the stats
        GenericRow readRow = new GenericRow();
        GenericRow transformedRow = new GenericRow();
        while (_recordReader.hasNext()) {
            transformedRow = readNextRowSanitized(readRow, transformedRow, fieldExtractor);
            collector.collectRow(transformedRow);
        }
        collector.build();
        return collector;
    } catch (Exception e) {
        LOGGER.error("Caught exception while gathering stats", e);
        Utils.rethrowException(e);
        return null;
    }
}
Also used : SegmentPreIndexStatsCollectorImpl(com.linkedin.pinot.core.segment.creator.impl.stats.SegmentPreIndexStatsCollectorImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow)
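
The stats-collection loop above can also be exercised directly against in-memory rows. Here is a minimal sketch using only the collector calls visible in this example; rows is an assumed Iterable<GenericRow> of already-sanitized rows and schema the matching Pinot Schema.

SegmentPreIndexStatsCollector collector = new SegmentPreIndexStatsCollectorImpl(schema);
collector.init();
for (GenericRow row : rows) {
    // Per-column statistics are accumulated one row at a time.
    collector.collectRow(row);
}
collector.build();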

Aggregations

GenericRow (com.linkedin.pinot.core.data.GenericRow): 45
HashMap (java.util.HashMap): 24
File (java.io.File): 17
Test (org.testng.annotations.Test): 15
Schema (com.linkedin.pinot.common.data.Schema): 14
SegmentGeneratorConfig (com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig): 14
ArrayList (java.util.ArrayList): 13
SegmentIndexCreationDriverImpl (com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl): 11
DimensionFieldSpec (com.linkedin.pinot.common.data.DimensionFieldSpec): 8
RecordReader (com.linkedin.pinot.core.data.readers.RecordReader): 8
Random (java.util.Random): 6
JSONObject (org.json.JSONObject): 5
FieldSpec (com.linkedin.pinot.common.data.FieldSpec): 4
ServerMetrics (com.linkedin.pinot.common.metrics.ServerMetrics): 4
MetricFieldSpec (com.linkedin.pinot.common.data.MetricFieldSpec): 3
AvroRecordReader (com.linkedin.pinot.core.data.readers.AvroRecordReader): 3
PinotSegmentRecordReader (com.linkedin.pinot.core.data.readers.PinotSegmentRecordReader): 3
TestRecordReader (com.linkedin.pinot.core.data.readers.TestRecordReader): 3
MetricsRegistry (com.yammer.metrics.core.MetricsRegistry): 3
BeforeClass (org.testng.annotations.BeforeClass): 3