Search in sources :

Example 1 with FixedBitMultiValueWriter

Use of com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter in the project pinot by linkedin.

From the class ForwardIndexWriterBenchmark, method convertRawToForwardIndex.

/**
 * Reads a raw multi-value column dump, writes it through a
 * {@link FixedBitMultiValueWriter} to {@code output.mv.fwd}, and prints size
 * estimates for several alternative offset encodings.
 *
 * <p>Each line of {@code rawFile} is one document; its values are
 * comma-separated integers.
 *
 * @param rawFile text file with one comma-separated list of ints per line
 * @throws Exception if the file cannot be read, a token is not a valid int,
 *     or the writer fails
 */
public static void convertRawToForwardIndex(File rawFile) throws Exception {
    List<String> lines;
    // Close the reader deterministically instead of leaking the file handle.
    try (FileReader reader = new FileReader(rawFile)) {
        lines = IOUtils.readLines(reader);
    }
    int totalDocs = lines.size();
    int max = Integer.MIN_VALUE;
    int totalNumValues = 0;
    int[][] data = new int[totalDocs][];
    for (int i = 0; i < totalDocs; i++) {
        String[] split = lines.get(i).split(",");
        totalNumValues += split.length;
        data[i] = new int[split.length];
        for (int j = 0; j < split.length; j++) {
            int val = Integer.parseInt(split[j]);
            data[i][j] = val;
            if (val > max) {
                max = val;
            }
        }
    }
    // Bits required to represent every value in [0, max]. The previous
    // ceil(log2(max)) under-counted by one whenever max was a power of two
    // (e.g. 8 needs 4 bits, not 3) and produced 0 or garbage for max <= 1.
    int maxBitsNeeded = max > 0 ? 32 - Integer.numberOfLeadingZeros(max) : 1;
    int size = 2048;
    // NOTE(review): offsets is never populated, so every bitmap serialized
    // below is built from zeros only; the reported bitmap size is presumably
    // not representative of real offsets -- confirm the intent.
    int[] offsets = new int[size];
    long bitMapSize = 0;
    File outputFile = new File("output.mv.fwd");
    FixedBitMultiValueWriter fixedBitSkipListSCMVWriter = new FixedBitMultiValueWriter(outputFile, totalDocs, totalNumValues, maxBitsNeeded);
    try {
        for (int i = 0; i < totalDocs; i++) {
            fixedBitSkipListSCMVWriter.setIntArray(i, data[i]);
            if (i % size == size - 1) {
                // A full chunk of `size` documents has been written.
                bitMapSize += serializedSizeInBytes(MutableRoaringBitmap.bitmapOf(offsets));
            } else if (i == totalDocs - 1) {
                // Trailing partial chunk.
                bitMapSize += serializedSizeInBytes(MutableRoaringBitmap.bitmapOf(Arrays.copyOf(offsets, i % size)));
            }
        }
    } finally {
        // Release the writer even when a document fails to be written.
        fixedBitSkipListSCMVWriter.close();
    }
    // Ceiling division: matches the number of chunks flushed by the loop above
    // (the old "totalDocs / size + 1" over-counted when totalDocs was an exact
    // multiple of size, and the printed value under-counted otherwise).
    int numChunks = (totalDocs + size - 1) / size;
    // Use long arithmetic so large inputs do not silently overflow int.
    long totalBits = (long) totalNumValues * maxBitsNeeded;
    long dataSizeinBytes = (totalBits + 7) / 8;
    long customBitsetSize = (totalNumValues + 7L) / 8;
    System.out.println("Output file size:" + outputFile.length());
    System.out.println("totalNumberOfDoc\t\t\t:" + totalDocs);
    System.out.println("totalNumberOfValues\t\t\t:" + totalNumValues);
    System.out.println("chunk size\t\t\t\t:" + size);
    System.out.println("Num chunks\t\t\t\t:" + numChunks);
    System.out.println("Raw data size with fixed bit encoding\t:" + dataSizeinBytes);
    System.out.println("\nPer encoding size");
    System.out.println();
    System.out.println("size (offset + length)\t\t\t:" + ((totalDocs * (4L + 4L)) + dataSizeinBytes));
    System.out.println();
    System.out.println("size (offset only)\t\t\t:" + ((totalDocs * 4L) + dataSizeinBytes));
    System.out.println();
    System.out.println("bitMapSize\t\t\t\t:" + bitMapSize);
    System.out.println("size (with bitmap)\t\t\t:" + (bitMapSize + (numChunks * 4L) + dataSizeinBytes));
    System.out.println();
    System.out.println("Custom Bitset\t\t\t\t:" + customBitsetSize);
    System.out.println("size (with custom bitset)\t\t\t:" + (customBitsetSize + (numChunks * 4L) + dataSizeinBytes));
}

/** Serializes {@code bitmap} into an in-memory buffer and returns the number of bytes written. */
private static int serializedSizeInBytes(MutableRoaringBitmap bitmap) throws java.io.IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream dos = new DataOutputStream(bos);
    bitmap.serialize(dos);
    dos.close();
    return bos.size();
}
Also used : FixedBitMultiValueWriter(com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter) MutableRoaringBitmap(org.roaringbitmap.buffer.MutableRoaringBitmap) DataOutputStream(java.io.DataOutputStream) FileReader(java.io.FileReader) ByteArrayOutputStream(java.io.ByteArrayOutputStream) File(java.io.File)

Example 2 with FixedBitMultiValueWriter

Use of com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter in the project pinot by linkedin.

From the class FixedBitSkipListSCMVWriterTest, method testSingleColMultiValue.

/**
 * Writes random multi-value rows with {@link FixedBitMultiValueWriter} for
 * every bit width from 2 to 31 and verifies the three sections of the
 * resulting file: the chunk-offset header, the start-of-row bitmap, and the
 * fixed-bit packed raw data.
 *
 * @throws Exception if the file cannot be written or read back
 */
@Test
public void testSingleColMultiValue() throws Exception {
    int maxBits = 2;
    while (maxBits < 32) {
        LOGGER.debug("START test maxBit:" + maxBits);
        File file = new File("test_single_col_multi_value_writer.dat");
        file.delete();
        int rows = 100;
        int[][] data = new int[rows][];
        // For maxBits == 31 the double-to-int cast saturates at Integer.MAX_VALUE,
        // which keeps the bound positive for Random.nextInt.
        int maxValue = (int) Math.pow(2, maxBits);
        Random r = new Random();
        int totalNumValues = 0;
        for (int i = 0; i < rows; i++) {
            int numValues = r.nextInt(100) + 1;
            data[i] = new int[numValues];
            for (int j = 0; j < numValues; j++) {
                data[i][j] = r.nextInt(maxValue);
            }
            totalNumValues += numValues;
        }
        FixedBitMultiValueWriter writer = new FixedBitMultiValueWriter(file, rows, totalNumValues, maxBits);
        // Expected packed data, built independently of the writer.
        CustomBitSet bitSet = CustomBitSet.withBitLength(totalNumValues * maxBits);
        CustomBitSet customBit = null;
        RandomAccessFile raf = null;
        DataInputStream dis = null;
        try {
            int numChunks = writer.getNumChunks();
            int[] chunkOffsets = new int[numChunks];
            int chunkId = 0;
            int offset = 0;
            int index = 0;
            for (int i = 0; i < rows; i++) {
                writer.setIntArray(i, data[i]);
                // The first row of each chunk records the running value offset.
                if (i % writer.getRowsPerChunk() == 0) {
                    chunkOffsets[chunkId] = offset;
                    chunkId = chunkId + 1;
                }
                offset += data[i].length;
                for (int j = 0; j < data[i].length; j++) {
                    int value = data[i][j];
                    // Lay the value out most-significant bit first within its maxBits-wide slot.
                    for (int bitPos = maxBits - 1; bitPos >= 0; bitPos--) {
                        if ((value & (1 << bitPos)) != 0) {
                            bitSet.setBit(index * maxBits + (maxBits - bitPos - 1));
                        }
                    }
                    index = index + 1;
                }
            }
            writer.close();
            LOGGER.trace("chunkOffsets: {}", Arrays.toString(chunkOffsets));
            //start validating the file
            raf = new RandomAccessFile(file, "r");
            Assert.assertEquals(raf.length(), writer.getTotalSize());
            dis = new DataInputStream(new FileInputStream(file));
            for (int i = 0; i < numChunks; i++) {
                Assert.assertEquals(dis.readInt(), chunkOffsets[i]);
            }
            int numBytesForBitmap = (totalNumValues + 7) / 8;
            Assert.assertEquals(writer.getBitsetSize(), numBytesForBitmap);
            byte[] bitsetBytes = new byte[numBytesForBitmap];
            // readFully: a plain read() may return fewer bytes than requested,
            // which would desynchronize every later section of the file.
            dis.readFully(bitsetBytes);
            customBit = CustomBitSet.withByteBuffer(numBytesForBitmap, ByteBuffer.wrap(bitsetBytes));
            offset = 0;
            LOGGER.trace(customBit.toString());
            // Every row's first value position must be flagged in the bitmap.
            for (int i = 0; i < rows; i++) {
                Assert.assertTrue(customBit.isBitSet(offset));
                offset += data[i].length;
            }
            byte[] byteArray = bitSet.toByteArray();
            LOGGER.trace("raf.length():" + raf.length());
            LOGGER.trace("getTotalSize:" + writer.getTotalSize());
            LOGGER.trace("getRawDataSize:" + writer.getRawDataSize());
            LOGGER.trace("getBitsetSize:" + writer.getBitsetSize());
            LOGGER.trace("getChunkOffsetHeaderSize:" + writer.getChunkOffsetHeaderSize());
            int dataLength = (int) (writer.getTotalSize() - writer.getChunkOffsetHeaderSize() - numBytesForBitmap);
            byte[] rawData = new byte[dataLength];
            // read the data segment that starts after the header.
            dis.readFully(rawData);
            Assert.assertEquals(rawData.length, byteArray.length);
            Assert.assertEquals(rawData, byteArray);
        } finally {
            // Clean up even when an assertion fails so later runs start from a clean state.
            if (raf != null) {
                raf.close();
            }
            if (dis != null) {
                dis.close();
            }
            file.delete();
            bitSet.close();
            if (customBit != null) {
                customBit.close();
            }
        }
        LOGGER.debug("END test maxBit:" + maxBits);
        maxBits = maxBits + 1;
    }
}
Also used : FixedBitMultiValueWriter(com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter) Random(java.util.Random) RandomAccessFile(java.io.RandomAccessFile) DataInputStream(java.io.DataInputStream) File(java.io.File) CustomBitSet(com.linkedin.pinot.core.util.CustomBitSet) FileInputStream(java.io.FileInputStream) Test(org.testng.annotations.Test)

Aggregations

FixedBitMultiValueWriter (com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter)2 File (java.io.File)2 CustomBitSet (com.linkedin.pinot.core.util.CustomBitSet)1 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 DataInputStream (java.io.DataInputStream)1 DataOutputStream (java.io.DataOutputStream)1 FileInputStream (java.io.FileInputStream)1 FileReader (java.io.FileReader)1 RandomAccessFile (java.io.RandomAccessFile)1 Random (java.util.Random)1 MutableRoaringBitmap (org.roaringbitmap.buffer.MutableRoaringBitmap)1 Test (org.testng.annotations.Test)1