Search in sources:

Example 1 with DimensionFieldSpec

use of com.linkedin.pinot.common.data.DimensionFieldSpec in project pinot by linkedin.

From the class AvroUtils, method getPinotSchemaFromAvroSchema:

/**
   * Given an avro schema object along with column field types and time unit, return the equivalent
   * pinot schema object.
   *
   * @param avroSchema Avro schema for which to get the Pinot schema.
   * @param fieldTypes Map containing fieldTypes for each column.
   * @param timeUnit Time unit to be used for the time column.
   * @return Return the equivalent pinot schema for the given avro schema.
   * @throws UnsupportedOperationException if a column has no entry in {@code fieldTypes}, or its
   *         field type is not one of DIMENSION / METRIC / TIME.
   */
private static Schema getPinotSchemaFromAvroSchema(org.apache.avro.Schema avroSchema, Map<String, FieldSpec.FieldType> fieldTypes, TimeUnit timeUnit) {
    Schema pinotSchema = new Schema();
    for (final Field field : avroSchema.getFields()) {
        String fieldName = field.name();
        FieldSpec.DataType dataType;
        try {
            dataType = AvroRecordReader.getColumnType(field);
        } catch (UnsupportedOperationException e) {
            // Fall back to STRING rather than failing the whole schema conversion.
            LOGGER.warn("Unsupported field type for field {} schema {}, using String instead.", fieldName, field.schema());
            dataType = FieldSpec.DataType.STRING;
        }
        FieldSpec.FieldType fieldType = fieldTypes.get(fieldName);
        // Fail fast with a descriptive message: switching on a null fieldType would throw a bare NPE.
        if (fieldType == null) {
            throw new UnsupportedOperationException("No field type specified for field: " + fieldName);
        }
        boolean isSingleValueField = AvroRecordReader.isSingleValueField(field);
        switch(fieldType) {
            case DIMENSION:
                pinotSchema.addField(new DimensionFieldSpec(fieldName, dataType, isSingleValueField));
                break;
            case METRIC:
                Preconditions.checkState(isSingleValueField, "Unsupported multi-value for metric field.");
                pinotSchema.addField(new MetricFieldSpec(fieldName, dataType));
                break;
            case TIME:
                Preconditions.checkState(isSingleValueField, "Unsupported multi-value for time field.");
                // Use the cached fieldName, consistent with the other branches.
                pinotSchema.addField(new TimeFieldSpec(fieldName, dataType, timeUnit));
                break;
            default:
                throw new UnsupportedOperationException("Unsupported field type: " + fieldType + " for field: " + fieldName);
        }
    }
    return pinotSchema;
}
Also used : Field(org.apache.avro.Schema.Field) Schema(com.linkedin.pinot.common.data.Schema) TimeFieldSpec(com.linkedin.pinot.common.data.TimeFieldSpec) MetricFieldSpec(com.linkedin.pinot.common.data.MetricFieldSpec) MetricFieldSpec(com.linkedin.pinot.common.data.MetricFieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) TimeFieldSpec(com.linkedin.pinot.common.data.TimeFieldSpec) FieldSpec(com.linkedin.pinot.common.data.FieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec)

Example 2 with DimensionFieldSpec

use of com.linkedin.pinot.common.data.DimensionFieldSpec in project pinot by linkedin.

From the class OffHeapStarTreeBuilder, method init:

/**
 * Initializes the builder from the given config: caches schema/dimension/metric/time specs,
 * builds per-dimension dictionaries, and opens the star-tree output data buffer.
 *
 * @param builderConfig Star-tree builder configuration (schema, split order, output dir, etc.).
 * @throws Exception if the output data file cannot be created.
 */
public void init(StarTreeBuilderConfig builderConfig) throws Exception {
    schema = builderConfig.schema;
    timeColumnName = schema.getTimeColumnName();
    this.dimensionsSplitOrder = builderConfig.dimensionsSplitOrder;
    skipStarNodeCreationForDimensions = builderConfig.getSkipStarNodeCreationForDimensions();
    skipMaterializationForDimensions = builderConfig.getSkipMaterializationForDimensions();
    skipMaterializationCardinalityThreshold = builderConfig.getSkipMaterializationCardinalityThreshold();
    enableOffHeapFormat = builderConfig.isEnableOffHealpFormat();
    this.maxLeafRecords = builderConfig.maxLeafRecords;
    this.outDir = builderConfig.getOutDir();
    if (outDir == null) {
        // Default to a unique temp directory when the config does not specify one.
        outDir = new File(System.getProperty("java.io.tmpdir"), V1Constants.STAR_TREE_INDEX_DIR + "_" + DateTime.now());
    }
    LOG.info("Index output directory:{}", outDir);
    dimensionTypes = new ArrayList<>();
    dimensionNames = new ArrayList<>();
    dimensionNameToIndexMap = HashBiMap.create();
    dimensionNameToStarValueMap = new HashMap<>();
    dictionaryMap = new HashMap<>();
    // READ DIMENSIONS COLUMNS
    List<DimensionFieldSpec> dimensionFieldSpecs = schema.getDimensionFieldSpecs();
    for (int index = 0; index < dimensionFieldSpecs.size(); index++) {
        DimensionFieldSpec spec = dimensionFieldSpecs.get(index);
        String dimensionName = spec.getName();
        dimensionNames.add(dimensionName);
        dimensionNameToIndexMap.put(dimensionName, index);
        Object starValue;
        starValue = getAllStarValue(spec);
        dimensionNameToStarValueMap.put(dimensionName, starValue);
        dimensionTypes.add(spec.getDataType());
        HashBiMap<Object, Integer> dictionary = HashBiMap.create();
        dictionaryMap.put(dimensionName, dictionary);
    }
    // The time column is treated as an extra dimension; it does not get a star node for
    // this dimension unless explicitly specified in split order
    if (timeColumnName != null) {
        dimensionNames.add(timeColumnName);
        TimeFieldSpec timeFieldSpec = schema.getTimeFieldSpec();
        dimensionTypes.add(timeFieldSpec.getDataType());
        int index = dimensionNameToIndexMap.size();
        dimensionNameToIndexMap.put(timeColumnName, index);
        Object starValue;
        starValue = getAllStarValue(timeFieldSpec);
        dimensionNameToStarValueMap.put(timeColumnName, starValue);
        HashBiMap<Object, Integer> dictionary = HashBiMap.create();
        // Use the cached timeColumnName (same value), consistent with the lines above.
        dictionaryMap.put(timeColumnName, dictionary);
    }
    // Each dimension is stored as one dictionary-encoded int.
    dimensionSizeBytes = dimensionNames.size() * Integer.SIZE / 8;
    this.numDimensions = dimensionNames.size();
    // READ METRIC COLUMNS
    this.metricNames = new ArrayList<>();
    this.metricNameToIndexMap = new HashMap<>();
    this.metricSizeBytes = 0;
    List<MetricFieldSpec> metricFieldSpecs = schema.getMetricFieldSpecs();
    for (int index = 0; index < metricFieldSpecs.size(); index++) {
        MetricFieldSpec spec = metricFieldSpecs.get(index);
        String metricName = spec.getName();
        metricNames.add(metricName);
        metricNameToIndexMap.put(metricName, index);
        metricSizeBytes += spec.getFieldSize();
    }
    numMetrics = metricNames.size();
    // BUG FIX: must create the resolved outDir, not builderConfig.getOutDir() — the latter
    // may be null (we defaulted outDir to a temp directory above) and would NPE here.
    outDir.mkdirs();
    dataFile = new File(outDir, "star-tree.buf");
    LOG.info("StarTree output data file: {}", dataFile.getAbsolutePath());
    dataBuffer = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(dataFile)));
    // INITIALIZE THE ROOT NODE
    this.starTreeRootIndexNode = new StarTreeIndexNode();
    this.starTreeRootIndexNode.setDimensionName(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setDimensionValue(StarTreeIndexNodeInterf.ALL);
    this.starTreeRootIndexNode.setLevel(0);
    LOG.info("dimensionNames:{}", dimensionNames);
    LOG.info("metricNames:{}", metricNames);
}
Also used : DataOutputStream(java.io.DataOutputStream) TimeFieldSpec(com.linkedin.pinot.common.data.TimeFieldSpec) MetricFieldSpec(com.linkedin.pinot.common.data.MetricFieldSpec) FileOutputStream(java.io.FileOutputStream) JSONObject(org.json.JSONObject) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec)

Example 3 with DimensionFieldSpec

use of com.linkedin.pinot.common.data.DimensionFieldSpec in project pinot by linkedin.

From the class RawIndexCreatorTest, method setup:

/**
   * Setup to build a segment with raw indexes (no-dictionary) of various data types.
   *
   * @throws Exception
   */
@BeforeClass
public void setup() throws Exception {
    // One single-valued dimension per supported raw-index data type.
    String[] columnNames = { INT_COLUMN, LONG_COLUMN, FLOAT_COLUMN, DOUBLE_COLUMN, STRING_COLUMN };
    FieldSpec.DataType[] columnTypes = { FieldSpec.DataType.INT, FieldSpec.DataType.LONG, FieldSpec.DataType.FLOAT, FieldSpec.DataType.DOUBLE, FieldSpec.DataType.STRING };
    Schema schema = new Schema();
    for (int i = 0; i < columnNames.length; i++) {
        schema.addField(new DimensionFieldSpec(columnNames[i], columnTypes[i], true));
    }
    // Seed from the clock so each run exercises different data.
    _random = new Random(System.nanoTime());
    _recordReader = buildIndex(schema);
}
Also used : Random(java.util.Random) Schema(com.linkedin.pinot.common.data.Schema) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) BeforeClass(org.testng.annotations.BeforeClass)

Example 4 with DimensionFieldSpec

use of com.linkedin.pinot.common.data.DimensionFieldSpec in project pinot by linkedin.

From the class NoDictionaryGroupKeyGeneratorTest, method buildSegment:

/**
   * Helper method to build a segment as follows:
   * <ul>
   *   <li> One string column without dictionary. </li>
   *   <li> One integer column with dictionary. </li>
   * </ul>
   *
   * It also computes the unique group keys while it generates the index.
   *
   * @return Set containing unique group keys from the created segment.
   *
   * @throws Exception
   */
private TestRecordReader buildSegment() throws Exception {
    // Schema with one single-valued dimension per configured column.
    Schema schema = new Schema();
    for (int col = 0; col < COLUMN_NAMES.length; col++) {
        schema.addField(new DimensionFieldSpec(COLUMN_NAMES[col], DATA_TYPES[col], true));
    }
    SegmentGeneratorConfig config = new SegmentGeneratorConfig(schema);
    config.setRawIndexCreationColumns(Arrays.asList(NO_DICT_COLUMN_NAMES));
    config.setOutDir(SEGMENT_DIR_NAME);
    config.setSegmentName(SEGMENT_NAME);
    // Generate NUM_ROWS rows of random values (string values are deterministic per row id).
    Random random = new Random();
    List<GenericRow> rows = new ArrayList<>(NUM_ROWS);
    for (int rowId = 0; rowId < NUM_ROWS; rowId++) {
        Map<String, Object> columnValues = new HashMap<>(NUM_COLUMNS);
        for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
            columnValues.put(fieldSpec.getName(), randomValue(fieldSpec.getDataType(), random, rowId));
        }
        GenericRow row = new GenericRow();
        row.init(columnValues);
        rows.add(row);
    }
    SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
    _recordReader = new TestRecordReader(rows, schema);
    driver.init(config, _recordReader);
    driver.build();
    return _recordReader;
}

/** Returns a random value of the given data type; STRING values embed the row id. */
private static Object randomValue(FieldSpec.DataType dataType, Random random, int rowId) {
    switch(dataType) {
        case INT:
            return random.nextInt();
        case LONG:
            return random.nextLong();
        case FLOAT:
            return random.nextFloat();
        case DOUBLE:
            return random.nextDouble();
        case STRING:
            return "value_" + rowId;
        default:
            throw new IllegalArgumentException("Illegal data type specified: " + dataType);
    }
}
Also used : TestRecordReader(com.linkedin.pinot.core.data.readers.TestRecordReader) HashMap(java.util.HashMap) Schema(com.linkedin.pinot.common.data.Schema) ArrayList(java.util.ArrayList) FieldSpec(com.linkedin.pinot.common.data.FieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) SegmentIndexCreationDriverImpl(com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl) GenericRow(com.linkedin.pinot.core.data.GenericRow) Random(java.util.Random) SegmentGeneratorConfig(com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec)

Example 5 with DimensionFieldSpec

use of com.linkedin.pinot.common.data.DimensionFieldSpec in project pinot by linkedin.

From the class DictionariesTest, method testIntColumnPreIndexStatsCollector:

/**
 * Verifies that the INT stats collector tracks sortedness, cardinality, and min/max while
 * accepting mixed boxed numeric types.
 */
@Test
public void testIntColumnPreIndexStatsCollector() throws Exception {
    FieldSpec spec = new DimensionFieldSpec("column1", DataType.INT, true);
    AbstractColumnStatisticsCollector statsCollector = new IntColumnPreIndexStatsCollector(spec);
    // Use valueOf/autoboxing: the boxing constructors (new Integer(...), etc.) are deprecated.
    // Non-decreasing sequence 1, 2, 3, 4, 4 keeps the collector in the "sorted" state.
    statsCollector.collect(Integer.valueOf(1));
    Assert.assertTrue(statsCollector.isSorted());
    statsCollector.collect(Float.valueOf(2));
    Assert.assertTrue(statsCollector.isSorted());
    statsCollector.collect(Long.valueOf(3));
    Assert.assertTrue(statsCollector.isSorted());
    statsCollector.collect(Double.valueOf(4));
    Assert.assertTrue(statsCollector.isSorted());
    statsCollector.collect(Integer.valueOf(4));
    Assert.assertTrue(statsCollector.isSorted());
    // First out-of-order value (2 after 4) flips sortedness, which must stay false thereafter.
    statsCollector.collect(Float.valueOf(2));
    Assert.assertFalse(statsCollector.isSorted());
    statsCollector.collect(Double.valueOf(40));
    Assert.assertFalse(statsCollector.isSorted());
    statsCollector.collect(Double.valueOf(20));
    Assert.assertFalse(statsCollector.isSorted());
    statsCollector.seal();
    // Distinct values collected: 1, 2, 3, 4, 40, 20 -> cardinality 6.
    Assert.assertEquals(statsCollector.getCardinality(), 6);
    Assert.assertEquals(((Number) statsCollector.getMinValue()).intValue(), 1);
    Assert.assertEquals(((Number) statsCollector.getMaxValue()).intValue(), 40);
    Assert.assertFalse(statsCollector.isSorted());
}
Also used : IntColumnPreIndexStatsCollector(com.linkedin.pinot.core.segment.creator.impl.stats.IntColumnPreIndexStatsCollector) AbstractColumnStatisticsCollector(com.linkedin.pinot.core.segment.creator.AbstractColumnStatisticsCollector) FieldSpec(com.linkedin.pinot.common.data.FieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) DimensionFieldSpec(com.linkedin.pinot.common.data.DimensionFieldSpec) Test(org.testng.annotations.Test)

Aggregations

DimensionFieldSpec (com.linkedin.pinot.common.data.DimensionFieldSpec)38 FieldSpec (com.linkedin.pinot.common.data.FieldSpec)27 Schema (com.linkedin.pinot.common.data.Schema)18 Test (org.testng.annotations.Test)17 MetricFieldSpec (com.linkedin.pinot.common.data.MetricFieldSpec)16 File (java.io.File)16 TimeFieldSpec (com.linkedin.pinot.common.data.TimeFieldSpec)13 HashMap (java.util.HashMap)9 GenericRow (com.linkedin.pinot.core.data.GenericRow)7 Random (java.util.Random)7 TimeGranularitySpec (com.linkedin.pinot.common.data.TimeGranularitySpec)6 AbstractColumnStatisticsCollector (com.linkedin.pinot.core.segment.creator.AbstractColumnStatisticsCollector)6 SegmentDictionaryCreator (com.linkedin.pinot.core.segment.creator.impl.SegmentDictionaryCreator)6 SegmentGeneratorConfig (com.linkedin.pinot.core.indexsegment.generator.SegmentGeneratorConfig)5 SegmentIndexCreationDriverImpl (com.linkedin.pinot.core.segment.creator.impl.SegmentIndexCreationDriverImpl)5 ArrayList (java.util.ArrayList)4 FieldType (com.linkedin.pinot.common.data.FieldSpec.FieldType)3 IndexSegment (com.linkedin.pinot.core.indexsegment.IndexSegment)3 DataType (com.linkedin.pinot.common.data.FieldSpec.DataType)2 RecordReader (com.linkedin.pinot.core.data.readers.RecordReader)2