Search in sources :

Example 11 with FixedByteSingleValueMultiColWriter

Use of com.linkedin.pinot.core.io.writer.impl.FixedByteSingleValueMultiColWriter in the pinot project by linkedin.

The build method of the class SegmentDictionaryCreator.

/**
 * Writes the dictionary for this column to {@code dictionaryFile} and populates the
 * value-to-index map matching the column's data type.
 *
 * <p>For INT, FLOAT, LONG and DOUBLE the values in {@code sortedList} are written in their
 * existing order, one fixed-width entry per row, and each value is mapped to its row index.
 *
 * <p>For STRING and BOOLEAN every value is converted with {@code toString()} and padded via
 * {@code getPaddedString} to the longest UTF-8 byte length found among the values (at least 1).
 * The padded values are then re-sorted before being written, and the value-to-index map is
 * keyed by the original (unpadded) string.
 *
 * <p>Every writer is closed in a {@code finally} block so that it is released even when
 * building the dictionary fails part-way through.
 *
 * @param isSorted single-element in/out flag; for STRING/BOOLEAN columns, element 0 is cleared
 *     when the padded values are not in ascending order in their incoming sequence
 * @throws RuntimeException if two distinct values collapse to the same padded string, or if the
 *     column's data type is not handled
 * @throws Exception if writing or closing the dictionary file fails
 */
public void build(boolean[] isSorted) throws Exception {
    switch(spec.getDataType()) {
        case INT: {
            final FixedByteSingleValueMultiColWriter intDictionaryWrite = new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1, V1Constants.Dict.INT_DICTIONARY_COL_SIZE);
            try {
                intValueToIndexMap = new Int2IntOpenHashMap(rowCount);
                final int[] sortedInts = (int[]) sortedList;
                for (int i = 0; i < rowCount; i++) {
                    final int entry = sortedInts[i];
                    intDictionaryWrite.setInt(i, 0, entry);
                    intValueToIndexMap.put(entry, i);
                }
            } finally {
                // Release the writer even if the cast or a write above failed.
                intDictionaryWrite.close();
            }
            break;
        }
        case FLOAT: {
            final FixedByteSingleValueMultiColWriter floatDictionaryWrite = new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1, V1Constants.Dict.FLOAT_DICTIONARY_COL_SIZE);
            try {
                floatValueToIndexMap = new Float2IntOpenHashMap(rowCount);
                final float[] sortedFloats = (float[]) sortedList;
                for (int i = 0; i < rowCount; i++) {
                    final float entry = sortedFloats[i];
                    floatDictionaryWrite.setFloat(i, 0, entry);
                    floatValueToIndexMap.put(entry, i);
                }
            } finally {
                floatDictionaryWrite.close();
            }
            break;
        }
        case LONG: {
            final FixedByteSingleValueMultiColWriter longDictionaryWrite = new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1, V1Constants.Dict.LONG_DICTIONARY_COL_SIZE);
            try {
                longValueToIndexMap = new Long2IntOpenHashMap(rowCount);
                final long[] sortedLongs = (long[]) sortedList;
                for (int i = 0; i < rowCount; i++) {
                    final long entry = sortedLongs[i];
                    longDictionaryWrite.setLong(i, 0, entry);
                    longValueToIndexMap.put(entry, i);
                }
            } finally {
                longDictionaryWrite.close();
            }
            break;
        }
        case DOUBLE: {
            final FixedByteSingleValueMultiColWriter doubleDictionaryWrite = new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1, V1Constants.Dict.DOUBLE_DICTIONARY_COL_SIZE);
            try {
                doubleValueToIndexMap = new Double2IntOpenHashMap(rowCount);
                final double[] sortedDoubles = (double[]) sortedList;
                for (int i = 0; i < rowCount; i++) {
                    final double entry = sortedDoubles[i];
                    doubleDictionaryWrite.setDouble(i, 0, entry);
                    doubleValueToIndexMap.put(entry, i);
                }
            } finally {
                doubleDictionaryWrite.close();
            }
            break;
        }
        case STRING:
        case BOOLEAN: {
            final Object[] sortedObjects = (Object[]) sortedList;
            // make sure that there is non-zero sized dictionary JIRA:PINOT-2947
            stringColumnMaxLength = 1;
            for (final Object e : sortedObjects) {
                final int length = e.toString().getBytes(utf8CharSet).length;
                if (stringColumnMaxLength < length) {
                    stringColumnMaxLength = length;
                }
            }
            final FixedByteSingleValueMultiColWriter stringDictionaryWrite = new FixedByteSingleValueMultiColWriter(dictionaryFile, rowCount, 1, new int[] { stringColumnMaxLength });
            try {
                final String[] revised = new String[rowCount];
                // Padded value -> original value; also used to detect padded-value collisions.
                final Map<String, String> revisedMap = new HashMap<String, String>();
                for (int i = 0; i < rowCount; i++) {
                    final String toWrite = sortedObjects[i].toString();
                    final String entry = getPaddedString(toWrite, stringColumnMaxLength, paddingChar);
                    revised[i] = entry;
                    // Padding can change relative order; record that the incoming order is no longer sorted.
                    if (isSorted[0] && i > 0 && (revised[i - 1].compareTo(entry) > 0)) {
                        isSorted[0] = false;
                    }
                    assert (revised[i].getBytes(utf8CharSet).length == stringColumnMaxLength);
                    revisedMap.put(revised[i], toWrite);
                }
                if (revisedMap.size() != sortedObjects.length) {
                    // Two strings map to the same padded string in the current column
                    throw new RuntimeException("Number of entries in dictionary != number of unique values in the data in column " + spec.getName());
                }
                Arrays.sort(revised);
                stringValueToIndexMap = new Object2IntOpenHashMap<>(rowCount);
                for (int i = 0; i < revised.length; i++) {
                    stringDictionaryWrite.setString(i, 0, revised[i]);
                    // No need to store padded value, we can store and lookup by raw value. In certain cases, original sorted order
                    // may be different from revised sorted order [PINOT-2730], so would need to use the original order in value
                    // to index map.
                    final String origString = revisedMap.get(revised[i]);
                    stringValueToIndexMap.put(origString, i);
                }
            } finally {
                // Also covers the collision RuntimeException above, which previously skipped close().
                stringDictionaryWrite.close();
            }
            break;
        }
        default:
            throw new RuntimeException("Unhandled type " + spec.getDataType());
    }
}
Also used : Long2IntOpenHashMap(it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap) Double2IntOpenHashMap(it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) HashMap(java.util.HashMap) Long2IntOpenHashMap(it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap) Object2IntOpenHashMap(it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap) Float2IntOpenHashMap(it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap) FixedByteSingleValueMultiColWriter(com.linkedin.pinot.core.io.writer.impl.FixedByteSingleValueMultiColWriter) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) Float2IntOpenHashMap(it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap) Double2IntOpenHashMap(it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap)

Aggregations

FixedByteSingleValueMultiColWriter (com.linkedin.pinot.core.io.writer.impl.FixedByteSingleValueMultiColWriter)11 FixedByteSingleValueMultiColReader (com.linkedin.pinot.core.io.reader.impl.FixedByteSingleValueMultiColReader)9 PinotDataBuffer (com.linkedin.pinot.core.segment.memory.PinotDataBuffer)8 File (java.io.File)8 Test (org.testng.annotations.Test)7 Random (java.util.Random)6 SortedForwardIndexReader (com.linkedin.pinot.core.io.reader.impl.SortedForwardIndexReader)1 SortedValueReaderContext (com.linkedin.pinot.core.io.reader.impl.SortedValueReaderContext)1 Double2IntOpenHashMap (it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap)1 Float2IntOpenHashMap (it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap)1 Int2IntOpenHashMap (it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)1 Long2IntOpenHashMap (it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap)1 Object2IntOpenHashMap (it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap)1 DataInputStream (java.io.DataInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1