Usage example of com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl in the pinot project (by LinkedIn):
the buildSegment method of class StarTreeIndexTestSegmentHelper.
/**
 * Builds a star-tree test segment with STRING dimensions, INT metrics and a
 * days-since-epoch time column, and returns the schema it was built with.
 *
 * @param segmentDirName      output directory for the generated segment
 * @param segmentName         name of the generated segment
 * @param hllConfig           HLL config passed through to the generator (presumably may be null — TODO confirm)
 * @param enableOffHeapFormat whether the star-tree index spec uses the off-heap format
 * @return the schema used to generate the segment
 * @throws Exception if segment generation fails
 */
private static Schema buildSegment(String segmentDirName, String segmentName, HllConfig hllConfig, boolean enableOffHeapFormat) throws Exception {
  // Row count scales with the number of dimension permutations.
  final int numRows = (int) MathUtils.factorial(NUM_DIMENSIONS) * 100;

  // Schema: single-valued STRING dimensions d1..dN, an INT time column in
  // days, and INT metrics m1..mN.
  Schema schema = new Schema();
  for (int dim = 1; dim <= NUM_DIMENSIONS; dim++) {
    String dimName = "d" + dim;
    schema.addField(dimName, new DimensionFieldSpec(dimName, FieldSpec.DataType.STRING, true));
  }
  schema.setTimeFieldSpec(new TimeFieldSpec(TIME_COLUMN_NAME, FieldSpec.DataType.INT, TimeUnit.DAYS));
  for (int met = 1; met <= NUM_METRICS; met++) {
    String metricName = "m" + met;
    schema.addField(metricName, new MetricFieldSpec(metricName, FieldSpec.DataType.INT));
  }

  // Generator configuration with star-tree indexing enabled.
  SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
  config.setEnableStarTreeIndex(true);
  config.setOutDir(segmentDirName);
  config.setFormat(FileFormat.AVRO);
  config.setSegmentName(segmentName);
  config.setHllConfig(hllConfig);
  config.setStarTreeIndexSpec(buildStarTreeIndexSpec(enableOffHeapFormat));

  // Fixed seed keeps the generated segment contents reproducible.
  Random random = new Random(RANDOM_SEED);
  final List<GenericRow> rows = new ArrayList<>();
  for (int rowId = 0; rowId < numRows; rowId++) {
    HashMap<String, Object> values = new HashMap<>();
    // First half of the dimensions: low cardinality, cycling with the row id.
    for (int i = 0; i < NUM_DIMENSIONS / 2; i++) {
      String dimName = schema.getDimensionFieldSpecs().get(i).getName();
      values.put(dimName, dimName + "-v" + rowId % (NUM_DIMENSIONS - i));
    }
    // Second half: random suffixes raise cardinality to better exercise HLL.
    for (int i = NUM_DIMENSIONS / 2; i < NUM_DIMENSIONS; i++) {
      String dimName = schema.getDimensionFieldSpecs().get(i).getName();
      values.put(dimName, dimName + "-v" + random.nextInt(i * 100));
    }
    // Metrics: uniform random ints in [0, METRIC_MAX_VALUE).
    for (int i = 0; i < NUM_METRICS; i++) {
      values.put(schema.getMetricFieldSpecs().get(i).getName(), random.nextInt(METRIC_MAX_VALUE));
    }
    // Time column: seven distinct day values.
    values.put(TIME_COLUMN_NAME, rowId % 7);

    GenericRow row = new GenericRow();
    row.init(values);
    rows.add(row);
  }

  // Drive the actual segment build from the in-memory rows.
  SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
  driver.init(config, new TestUtils.GenericRowRecordReader(schema, rows));
  driver.build();
  LOGGER.info("Built segment {} at {}", segmentName, segmentDirName);
  return schema;
}
Usage example of com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl in the pinot project (by LinkedIn):
the buildSegment method of class NoDictionaryGroupKeyGeneratorTest.
/**
* Helper method to build a segment as follows:
* <ul>
* <li> One string column without dictionary. </li>
* <li> One integer column with dictionary. </li>
* </ul>
*
* It also computes the unique group keys while it generates the index.
*
* @return Set containing unique group keys from the created segment.
*
* @throws Exception
*/
/**
 * Helper method to build a segment as follows:
 * <ul>
 *   <li> One string column without dictionary. </li>
 *   <li> One integer column with dictionary. </li>
 * </ul>
 *
 * It also computes the unique group keys while it generates the index.
 *
 * @return Record reader over the rows the segment was built from.
 * @throws Exception if segment generation fails
 */
private TestRecordReader buildSegment() throws Exception {
  // One single-valued dimension per configured column.
  Schema schema = new Schema();
  for (int i = 0; i < COLUMN_NAMES.length; i++) {
    DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(COLUMN_NAMES[i], DATA_TYPES[i], true);
    schema.addField(dimensionFieldSpec);
  }

  SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
  // Columns listed here are written without a dictionary (raw index).
  config.setRawIndexCreationColumns(Arrays.asList(NO_DICT_COLUMN_NAMES));
  config.setOutDir(SEGMENT_DIR_NAME);
  config.setSegmentName(SEGMENT_NAME);

  // Fixed seed: an unseeded Random made the generated data — and therefore
  // any test failure — impossible to reproduce across runs.
  Random random = new Random(0xBADC0FFEL);
  List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
  for (int i = 0; i < NUM_ROWS; i++) {
    Map<String, Object> map = new HashMap<>(NUM_COLUMNS);
    // Fill each column with a value matching its declared data type; STRING
    // values are derived from the row index so they vary per row.
    for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
      String column = fieldSpec.getName();
      FieldSpec.DataType dataType = fieldSpec.getDataType();
      switch (dataType) {
        case INT:
          map.put(column, random.nextInt());
          break;
        case LONG:
          map.put(column, random.nextLong());
          break;
        case FLOAT:
          map.put(column, random.nextFloat());
          break;
        case DOUBLE:
          map.put(column, random.nextDouble());
          break;
        case STRING:
          map.put(column, "value_" + i);
          break;
        default:
          throw new IllegalArgumentException("Illegal data type specified: " + dataType);
      }
    }
    GenericRow genericRow = new GenericRow();
    genericRow.init(map);
    rows.add(genericRow);
  }

  // Build the segment; keep the record reader so the caller can re-scan rows.
  SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
  _recordReader = new TestRecordReader(rows, schema);
  driver.init(config, _recordReader);
  driver.build();
  return _recordReader;
}
Usage example of com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl in the pinot project (by LinkedIn):
the setupSegmentFor method of class QueryExceptionTest.
/**
 * Creates a fresh test segment for the given table from the Avro test resource.
 *
 * @param table table name to generate the segment for
 * @throws Exception if the index directory cannot be prepared or the build fails
 */
private void setupSegmentFor(String table) throws Exception {
  final String filePath = TestUtils.getFileFromResourceUrl(getClass().getClassLoader().getResource(AVRO_DATA));

  // Start from a clean index directory. The previous deleteQuietly() +
  // unchecked mkdir() could silently leave stale data behind or fail to create
  // the directory, surfacing later as an obscure segment-build failure.
  if (INDEX_DIR.exists()) {
    FileUtils.deleteDirectory(INDEX_DIR);
  }
  if (!INDEX_DIR.mkdirs()) {
    throw new IllegalStateException("Failed to create index directory: " + INDEX_DIR.getAbsolutePath());
  }

  final SegmentGeneratorConfig config = SegmentTestUtils.getSegmentGenSpecWithSchemAndProjectedColumns(new File(filePath), new File(INDEX_DIR, "segment"), "daysSinceEpoch", TimeUnit.DAYS, table);
  final SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
  driver.init(config);
  driver.build();
}
Usage example of com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl in the pinot project (by LinkedIn):
the execute method of class CreateSegmentCommand.
/**
 * Executes the create-segment command: merges command-line options with the
 * optional generator config file (command line wins on conflict), then builds
 * one segment per input file in parallel.
 *
 * @return true only if all segment builds completed successfully within the
 *         one-hour timeout; false if the timeout elapsed or any build failed
 * @throws Exception on invalid configuration or interruption
 */
@Override
public boolean execute() throws Exception {
  LOGGER.info("Executing command: {}", toString());

  // Load generator config if one was provided, else start from defaults.
  final SegmentGeneratorConfig segmentGeneratorConfig;
  if (_generatorConfigFile != null) {
    segmentGeneratorConfig = new ObjectMapper().readValue(new File(_generatorConfigFile), SegmentGeneratorConfig.class);
  } else {
    segmentGeneratorConfig = new SegmentGeneratorConfig();
  }

  // For each setting: take the command-line value when given (warning on a
  // conflict with the config file), otherwise fall back to the config file.
  String configDataDir = segmentGeneratorConfig.getDataDir();
  if (_dataDir == null) {
    if (configDataDir == null) {
      throw new RuntimeException("Must specify dataDir.");
    }
    _dataDir = configDataDir;
  } else {
    if (configDataDir != null && !configDataDir.equals(_dataDir)) {
      LOGGER.warn("Find dataDir conflict in command line and config file, use config in command line: {}", _dataDir);
    }
  }

  FileFormat configFormat = segmentGeneratorConfig.getFormat();
  if (_format == null) {
    if (configFormat == null) {
      throw new RuntimeException("Format cannot be null in config file.");
    }
    _format = configFormat;
  } else {
    // AVRO is the config default, so a config value of AVRO is not treated as
    // an explicit conflict with the command line.
    if (configFormat != _format && configFormat != FileFormat.AVRO) {
      LOGGER.warn("Find format conflict in command line and config file, use config in command line: {}", _format);
    }
  }

  String configOutDir = segmentGeneratorConfig.getOutDir();
  if (_outDir == null) {
    if (configOutDir == null) {
      throw new RuntimeException("Must specify outDir.");
    }
    _outDir = configOutDir;
  } else {
    if (configOutDir != null && !configOutDir.equals(_outDir)) {
      LOGGER.warn("Find outDir conflict in command line and config file, use config in command line: {}", _outDir);
    }
  }

  // Overwrite is sticky: either source enabling it enables it.
  if (segmentGeneratorConfig.isOverwrite()) {
    _overwrite = true;
  }

  String configTableName = segmentGeneratorConfig.getTableName();
  if (_tableName == null) {
    if (configTableName == null) {
      throw new RuntimeException("Must specify tableName.");
    }
    _tableName = configTableName;
  } else {
    if (configTableName != null && !configTableName.equals(_tableName)) {
      LOGGER.warn("Find tableName conflict in command line and config file, use config in command line: {}", _tableName);
    }
  }

  String configSegmentName = segmentGeneratorConfig.getSegmentName();
  if (_segmentName == null) {
    if (configSegmentName == null) {
      throw new RuntimeException("Must specify segmentName.");
    }
    _segmentName = configSegmentName;
  } else {
    if (configSegmentName != null && !configSegmentName.equals(_segmentName)) {
      LOGGER.warn("Find segmentName conflict in command line and config file, use config in command line: {}", _segmentName);
    }
  }

  // Collect input files whose extension matches the chosen format.
  File dir = new File(_dataDir);
  if (!dir.exists() || !dir.isDirectory()) {
    throw new RuntimeException("Data directory " + _dataDir + " not found.");
  }
  File[] files = dir.listFiles(new FilenameFilter() {
    @Override
    public boolean accept(File dir, String name) {
      return name.toLowerCase().endsWith(_format.toString().toLowerCase());
    }
  });
  if ((files == null) || (files.length == 0)) {
    throw new RuntimeException("Data directory " + _dataDir + " does not contain " + _format.toString().toUpperCase() + " files.");
  }

  // Make sure output directory does not already exist, or can be overwritten.
  File outDir = new File(_outDir);
  if (outDir.exists()) {
    if (!_overwrite) {
      throw new IOException("Output directory " + _outDir + " already exists.");
    } else {
      FileUtils.deleteDirectory(outDir);
    }
  }

  // Push the merged values back into the generator config.
  segmentGeneratorConfig.setDataDir(_dataDir);
  segmentGeneratorConfig.setFormat(_format);
  segmentGeneratorConfig.setOutDir(_outDir);
  segmentGeneratorConfig.setOverwrite(_overwrite);
  segmentGeneratorConfig.setTableName(_tableName);
  segmentGeneratorConfig.setSegmentName(_segmentName);
  if (_schemaFile != null) {
    if (segmentGeneratorConfig.getSchemaFile() != null && !segmentGeneratorConfig.getSchemaFile().equals(_schemaFile)) {
      LOGGER.warn("Find schemaFile conflict in command line and config file, use config in command line: {}", _schemaFile);
    }
    segmentGeneratorConfig.setSchemaFile(_schemaFile);
  }
  if (_readerConfigFile != null) {
    if (segmentGeneratorConfig.getReaderConfigFile() != null && !segmentGeneratorConfig.getReaderConfigFile().equals(_readerConfigFile)) {
      LOGGER.warn("Find readerConfigFile conflict in command line and config file, use config in command line: {}", _readerConfigFile);
    }
    segmentGeneratorConfig.setReaderConfigFile(_readerConfigFile);
  }
  if (_enableStarTreeIndex) {
    segmentGeneratorConfig.setEnableStarTreeIndex(true);
  }
  if (_starTreeIndexSpecFile != null) {
    if (segmentGeneratorConfig.getStarTreeIndexSpecFile() != null && !segmentGeneratorConfig.getStarTreeIndexSpecFile().equals(_starTreeIndexSpecFile)) {
      LOGGER.warn("Find starTreeIndexSpecFile conflict in command line and config file, use config in command line: {}", _starTreeIndexSpecFile);
    }
    segmentGeneratorConfig.setStarTreeIndexSpecFile(_starTreeIndexSpecFile);
  }

  // Track build failures from worker threads. Rethrowing RuntimeException
  // inside Runnable.run() (as before) only kills the worker thread — the
  // executor silently drops it, and execute() would report success even when
  // segment builds failed. Reads of this flag after awaitTermination() are
  // safe: task completion happens-before awaitTermination() returning true,
  // and the synchronized block covers the timeout case.
  final boolean[] buildFailed = new boolean[1];
  ExecutorService executor = Executors.newFixedThreadPool(_numThreads);
  int cnt = 0;
  for (final File file : files) {
    final int segCnt = cnt;
    executor.execute(new Runnable() {
      @Override
      public void run() {
        try {
          SegmentGeneratorConfig config = new SegmentGeneratorConfig(segmentGeneratorConfig);
          config.setInputFilePath(file.getAbsolutePath());
          config.setSegmentName(_segmentName + "_" + segCnt);
          config.loadConfigFiles();
          final SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
          driver.init(config);
          driver.build();
        } catch (Exception e) {
          LOGGER.error("Failed to build segment for input file: {}", file.getAbsolutePath(), e);
          synchronized (buildFailed) {
            buildFailed[0] = true;
          }
        }
      }
    });
    cnt += 1;
  }
  executor.shutdown();
  boolean terminated = executor.awaitTermination(1, TimeUnit.HOURS);
  synchronized (buildFailed) {
    return terminated && !buildFailed[0];
  }
}
Usage example of com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl in the pinot project (by LinkedIn):
the convertSegment method of class ColumnarToStarTreeConverter.
/**
* Helper method to perform the conversion.
* @param columnarSegment Columnar segment directory to convert
* @throws Exception
*/
/**
 * Converts one columnar segment directory into a segment with a star-tree
 * index, writing the result under the configured output directory.
 *
 * @param columnarSegment Columnar segment directory to convert
 * @throws Exception if reading the segment or building the new one fails
 */
private void convertSegment(File columnarSegment) throws Exception {
  // The existing segment supplies the schema for the new one.
  PinotSegmentRecordReader recordReader = new PinotSegmentRecordReader(columnarSegment);

  // Rebuild the same data as a new segment with star-tree indexing enabled;
  // the segment keeps its original name.
  SegmentGeneratorConfig generatorConfig = new SegmentGeneratorConfig(recordReader.getSchema());
  generatorConfig.setFormat(FileFormat.PINOT);
  generatorConfig.setDataDir(_inputDirName);
  generatorConfig.setInputFilePath(columnarSegment.getAbsolutePath());
  generatorConfig.setOutDir(_outputDirName);
  generatorConfig.setSegmentName(columnarSegment.getName());
  generatorConfig.setOverwrite(_overwrite);
  generatorConfig.setEnableStarTreeIndex(true);
  generatorConfig.setStarTreeIndexSpecFile(_starTreeConfigFileName);

  SegmentIndexCreationDriver driver = new SegmentIndexCreationDriverImpl();
  driver.init(generatorConfig);
  driver.build();
}
Aggregations