use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
the class TransformGroupByTest method buildSegment.
/**
 * Helper method to build a segment with one dimension column containing values
 * from {@link #_dimensionValues}, and one metric column.
 *
 * Also builds the expected group-by result as it builds the segment.
 *
 * @param segmentDirName Name of segment directory
 * @param segmentName Name of segment
 * @param schema Schema for segment
 * @return Record reader over the rows used to build the segment
 * @throws Exception
 */
private RecordReader buildSegment(String segmentDirName, String segmentName, Schema schema) throws Exception {
  SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
  config.setOutDir(segmentDirName);
  config.setFormat(FileFormat.AVRO);
  config.setTableName(TABLE_NAME);
  config.setSegmentName(segmentName);

  Random random = new Random(RANDOM_SEED);
  long currentTimeMillis = System.currentTimeMillis();
  // Divide the day into fixed parts, and decrement the time column value by this delta, so as to get
  // continuous days in the input. This gives about 10 days per 10k rows.
  long timeDelta = TimeUnit.MILLISECONDS.convert(1, TimeUnit.DAYS) / 1000;

  final List<GenericRow> data = new ArrayList<>();
  int numDimValues = _dimensionValues.length;
  for (int row = 0; row < NUM_ROWS; row++) {
    HashMap<String, Object> map = new HashMap<>();
    map.put(DIMENSION_NAME, _dimensionValues[random.nextInt(numDimValues)]);
    map.put(METRIC_NAME, random.nextDouble());
    map.put(TIME_COLUMN_NAME, currentTimeMillis);
    currentTimeMillis -= timeDelta;

    GenericRow genericRow = new GenericRow();
    genericRow.init(map);
    data.add(genericRow);
  }

  SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
  RecordReader reader = new TestUtils.GenericRowRecordReader(schema, data);
  driver.init(config, reader);
  driver.build();

  LOGGER.info("Built segment {} at {}", segmentName, segmentDirName);
  return reader;
}
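For context, here is a hedged sketch of how the returned reader can be replayed to compute the expected group-by sums the javadoc mentions. The rewind/hasNext pattern follows the other snippets on this page; RecordReader#next() and GenericRow#getValue(String) are assumed to follow the usual contract, and the rest is illustrative only.

// Hypothetical caller-side sketch, not part of the test above.
RecordReader reader = buildSegment(segmentDirName, segmentName, schema);
reader.rewind();
Map<String, Double> expectedGroupBySum = new HashMap<>();
while (reader.hasNext()) {
  GenericRow row = reader.next();
  String group = (String) row.getValue(DIMENSION_NAME);  // dimension value for this row
  double metric = (double) row.getValue(METRIC_NAME);    // metric value for this row
  // Accumulate the per-group sum that a SUM group-by query should return.
  expectedGroupBySum.merge(group, metric, Double::sum);
}
reader.close();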
use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
the class TransformExpressionOperatorTest method buildSegment.
/**
 * Helper method to build a segment with {@link #NUM_METRICS} metric columns filled
 * with random data as per the schema.
 *
 * @param segmentDirName Name of segment directory
 * @param segmentName Name of segment
 * @param schema Schema for segment
 * @return The schema used to build the segment
 * @throws Exception
 */
private Schema buildSegment(String segmentDirName, String segmentName, Schema schema) throws Exception {
  SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
  config.setOutDir(segmentDirName);
  config.setFormat(FileFormat.AVRO);
  config.setSegmentName(segmentName);

  Random random = new Random(RANDOM_SEED);
  final List<GenericRow> data = new ArrayList<>();
  _values = new double[NUM_ROWS][NUM_METRICS];
  for (int row = 0; row < NUM_ROWS; row++) {
    HashMap<String, Object> map = new HashMap<>();
    // Metric columns.
    for (int i = 0; i < NUM_METRICS; i++) {
      String metName = schema.getMetricFieldSpecs().get(i).getName();
      double value = random.nextInt(MAX_METRIC_VALUE) + random.nextDouble() + 1.0;
      map.put(metName, value);
      _values[row][i] = value;
    }
    GenericRow genericRow = new GenericRow();
    genericRow.init(map);
    data.add(genericRow);
  }

  SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
  RecordReader reader = new TestUtils.GenericRowRecordReader(schema, data);
  driver.init(config, reader);
  driver.build();

  LOGGER.info("Built segment {} at {}", segmentName, segmentDirName);
  return schema;
}
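Since the raw metric values are captured in _values, an expected answer for any transform expression can be computed straight off that matrix. A minimal sketch, assuming a hypothetical expression over the first two metric columns:

// Expected result of a hypothetical expression add(m0, m1), where m0 and m1
// are the first two metric columns generated above.
double[] expected = new double[NUM_ROWS];
for (int row = 0; row < NUM_ROWS; row++) {
  expected[row] = _values[row][0] + _values[row][1];
}

The test can then compare expected against the values the transform expression operator produces for the same rows.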
use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
the class SegmentIndexCreationDriverImpl method buildStarTree.
private void buildStarTree() throws Exception {
  // Create stats collector
  SegmentPreIndexStatsCollectorImpl statsCollector = new SegmentPreIndexStatsCollectorImpl(dataSchema);
  statsCollector.init();
  segmentStats = statsCollector;
  long start = System.currentTimeMillis();

  // Construct the star tree index spec, falling back to defaults if none was configured
  StarTreeIndexSpec starTreeIndexSpec = config.getStarTreeIndexSpec();
  if (starTreeIndexSpec == null) {
    starTreeIndexSpec = new StarTreeIndexSpec();
    starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
    config.setStarTreeIndexSpec(starTreeIndexSpec);
  }
  List<String> dimensionsSplitOrder = starTreeIndexSpec.getDimensionsSplitOrder();
  if (dimensionsSplitOrder != null && !dimensionsSplitOrder.isEmpty()) {
    // The time column is not part of the split order.
    final String timeColumnName = config.getTimeColumnName();
    if (timeColumnName != null) {
      dimensionsSplitOrder.remove(timeColumnName);
    }
  }

  // Create the star tree builder config from the StarTreeIndexSpec. TODO: merge these two configs into one.
  StarTreeBuilderConfig starTreeBuilderConfig = new StarTreeBuilderConfig();
  starTreeBuilderConfig.setSchema(dataSchema);
  starTreeBuilderConfig.setDimensionsSplitOrder(dimensionsSplitOrder);
  starTreeBuilderConfig.setMaxLeafRecords(starTreeIndexSpec.getMaxLeafRecords());
  starTreeBuilderConfig.setSkipStarNodeCreationForDimensions(starTreeIndexSpec.getSkipStarNodeCreationForDimensions());
  Set<String> skipMaterializationForDimensions = starTreeIndexSpec.getskipMaterializationForDimensions();
  starTreeBuilderConfig.setSkipMaterializationForDimensions(skipMaterializationForDimensions);
  starTreeBuilderConfig.setSkipMaterializationCardinalityThreshold(
      starTreeIndexSpec.getskipMaterializationCardinalityThreshold());
  starTreeBuilderConfig.setOutDir(starTreeTempDir);
  boolean enableOffHeapFormat = starTreeIndexSpec.isEnableOffHeapFormat();
  starTreeBuilderConfig.setEnableOffHealpFormat(enableOffHeapFormat);

  // Initialize the star tree builder
  StarTreeBuilder starTreeBuilder = new OffHeapStarTreeBuilder();
  starTreeBuilder.init(starTreeBuilderConfig);

  // Build the star tree while collecting stats
  recordReader.rewind();
  LOGGER.info("Start append raw data to star tree builder!");
  totalDocs = 0;
  GenericRow readRow = new GenericRow();
  GenericRow transformedRow = new GenericRow();
  while (recordReader.hasNext()) {
    // PlainFieldExtractor conducts the necessary type conversions
    transformedRow = readNextRowSanitized(readRow, transformedRow);
    // Must be called after the previous step, since type conversion for derived values is unnecessary
    populateDefaultDerivedColumnValues(transformedRow);
    starTreeBuilder.append(transformedRow);
    statsCollector.collectRow(transformedRow);
    totalRawDocs++;
    totalDocs++;
  }
  recordReader.close();
  LOGGER.info("Start building star tree!");
  starTreeBuilder.build();
  LOGGER.info("Finished building star tree!");
  long starTreeBuildFinishTime = System.currentTimeMillis();

  // Build stats: count the number of documents and gather per-column statistics
  LOGGER.info("Start building StatsCollector!");
  Iterator<GenericRow> aggregatedRowsIterator = starTreeBuilder.iterator(starTreeBuilder.getTotalRawDocumentCount(),
      starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
  while (aggregatedRowsIterator.hasNext()) {
    GenericRow genericRow = aggregatedRowsIterator.next();
    statsCollector.collectRow(genericRow, true);
    totalAggDocs++;
    totalDocs++;
  }
  statsCollector.build();
  buildIndexCreationInfo();
  LOGGER.info("Collected stats for {} raw documents, {} aggregated documents", totalRawDocs, totalAggDocs);
  long statCollectionFinishTime = System.currentTimeMillis();

  // Initialize the index creation using the per-column statistics information
  indexCreator.init(config, segmentIndexCreationInfo, indexCreationInfoMap, dataSchema, tempIndexDir);

  // Iterate over the data again (raw and aggregated documents) to build the column indexes
  Iterator<GenericRow> allRowsIterator = starTreeBuilder.iterator(0,
      starTreeBuilder.getTotalRawDocumentCount() + starTreeBuilder.getTotalAggregateDocumentCount());
  while (allRowsIterator.hasNext()) {
    GenericRow genericRow = allRowsIterator.next();
    indexCreator.indexRow(genericRow);
  }

  // This is required so the dimensionsSplitOrder used by the builder can be written into the segment metadata.
  if (dimensionsSplitOrder == null || dimensionsSplitOrder.isEmpty()) {
    starTreeIndexSpec.setDimensionsSplitOrder(starTreeBuilder.getDimensionsSplitOrder());
  }
  if (skipMaterializationForDimensions == null || skipMaterializationForDimensions.isEmpty()) {
    starTreeIndexSpec.setSkipMaterializationForDimensions(starTreeBuilder.getSkipMaterializationForDimensions());
  }
  serializeTree(starTreeBuilder, enableOffHeapFormat);

  // Post creation
  handlePostCreation();
  starTreeBuilder.cleanup();
  long end = System.currentTimeMillis();
  LOGGER.info("Total time:{} \n star tree build time:{} \n stat collection time:{} \n column index build time:{}",
      (end - start), (starTreeBuildFinishTime - start), statCollectionFinishTime - starTreeBuildFinishTime,
      end - statCollectionFinishTime);
}
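For reference, a hedged sketch of the caller-side configuration this method consumes. The setters mirror the getters exercised above; the column names are made up, and setEnableOffHeapFormat is assumed to be the writable counterpart of isEnableOffHeapFormat().

// Hypothetical configuration sketch; column names are invented for illustration.
StarTreeIndexSpec starTreeIndexSpec = new StarTreeIndexSpec();
starTreeIndexSpec.setMaxLeafRecords(StarTreeIndexSpec.DEFAULT_MAX_LEAF_RECORDS);
// Use a mutable list: buildStarTree() may remove the time column from the split order.
starTreeIndexSpec.setDimensionsSplitOrder(new ArrayList<>(Arrays.asList("country", "pageKey")));
// High-cardinality dimensions that should not be pre-aggregated (hypothetical column).
starTreeIndexSpec.setSkipMaterializationForDimensions(Collections.singleton("memberId"));
starTreeIndexSpec.setEnableOffHeapFormat(true);  // assumed setter name
config.setStarTreeIndexSpec(starTreeIndexSpec);

Note that if the split order or the skip-materialization set is left empty, the method above back-fills them from the builder so they land in the segment metadata.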
use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
the class IndexSegmentImpl method iterator.
public Iterator<GenericRow> iterator(final int startDocId, final int endDocId) {
  final Map<String, BlockSingleValIterator> singleValIteratorMap = new HashMap<>();
  final Map<String, BlockMultiValIterator> multiValIteratorMap = new HashMap<>();
  for (String column : getColumnNames()) {
    DataSource dataSource = getDataSource(column);
    BlockValIterator iterator = dataSource.getNextBlock().getBlockValueSet().iterator();
    if (dataSource.getDataSourceMetadata().isSingleValue()) {
      singleValIteratorMap.put(column, (BlockSingleValIterator) iterator);
    } else {
      multiValIteratorMap.put(column, (BlockMultiValIterator) iterator);
    }
  }

  return new Iterator<GenericRow>() {
    int docId = startDocId;

    @Override
    public boolean hasNext() {
      return docId < endDocId;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }

    @Override
    public GenericRow next() {
      Map<String, Object> map = new HashMap<>();
      for (String column : singleValIteratorMap.keySet()) {
        // Read the dictionary id for this doc and decode it to the raw value.
        int dictId = singleValIteratorMap.get(column).nextIntVal();
        Dictionary dictionary = getDictionaryFor(column);
        map.put(column, dictionary.get(dictId));
      }
      for (String column : multiValIteratorMap.keySet()) {
        // TODO: handle multi-value columns
      }
      GenericRow genericRow = new GenericRow();
      genericRow.init(map);
      docId++;
      return genericRow;
    }
  };
}
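A hedged usage sketch: the iterator yields each document in the given doc-id range as a dictionary-decoded GenericRow. The helper below is hypothetical; only iterator(), hasNext(), next() and GenericRow come from the snippet, and getValue is assumed to return the value stored under the column name.

// Hypothetical helper: print one column of every document in a segment.
static void dumpColumn(IndexSegmentImpl segment, String column, int totalDocs) {
  Iterator<GenericRow> rows = segment.iterator(0, totalDocs);
  while (rows.hasNext()) {
    GenericRow row = rows.next();
    System.out.println(row.getValue(column));
  }
}

Note the doc-id range is half-open: startDocId is included, endDocId is not.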
use of com.linkedin.pinot.core.data.GenericRow in project pinot by linkedin.
the class RecordReaderSegmentCreationDataSource method gatherStats.
@Override
public SegmentPreIndexStatsCollector gatherStats(FieldExtractor fieldExtractor) {
  try {
    SegmentPreIndexStatsCollector collector = new SegmentPreIndexStatsCollectorImpl(_recordReader.getSchema());
    collector.init();

    // Gather the stats
    GenericRow readRow = new GenericRow();
    GenericRow transformedRow = new GenericRow();
    while (_recordReader.hasNext()) {
      transformedRow = readNextRowSanitized(readRow, transformedRow, fieldExtractor);
      collector.collectRow(transformedRow);
    }
    collector.build();
    return collector;
  } catch (Exception e) {
    LOGGER.error("Caught exception while gathering stats", e);
    Utils.rethrowException(e);
    return null;
  }
}
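A hedged sketch of how a segment creation driver might consume this data source; the constructor and the field-extractor setup are assumptions, not the project's confirmed API.

// Hypothetical caller-side sketch.
RecordReaderSegmentCreationDataSource dataSource =
    new RecordReaderSegmentCreationDataSource(recordReader);  // constructor assumed
// fieldExtractor is assumed to be a PlainFieldExtractor (see the star tree
// snippet above) built for the reader's schema.
SegmentPreIndexStatsCollector statsCollector = dataSource.gatherStats(fieldExtractor);
// statsCollector now holds the per-column statistics used to initialize index creation.

Note the two reusable GenericRow instances in gatherStats: readNextRowSanitized fills them in place, so stats collection does not allocate a fresh row per record.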