Example use of com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter in the project pinot by linkedin.
From the class ForwardIndexWriterBenchmark, method convertRawToForwardIndex.
/**
 * Converts a raw multi-value text file into a fixed-bit forward index and prints size estimates for
 * several ways of encoding where each document's values start.
 *
 * <p>Every line of {@code rawFile} is one document; its comma-separated tokens are parsed as ints.
 * The index is written to {@code output.mv.fwd} (a relative path) and the statistics go to
 * standard output.
 *
 * @param rawFile text file with one comma-separated list of ints per line
 * @throws Exception if the file cannot be read, a token is not a valid int, or writing the index fails
 */
public static void convertRawToForwardIndex(File rawFile) throws Exception {
  List<String> lines;
  // Close the reader as soon as the lines are in memory.
  try (FileReader reader = new FileReader(rawFile)) {
    lines = IOUtils.readLines(reader);
  }
  int totalDocs = lines.size();
  int max = Integer.MIN_VALUE;
  int totalNumValues = 0;
  int[][] data = new int[totalDocs][];
  for (int i = 0; i < totalDocs; i++) {
    String[] split = lines.get(i).split(",");
    totalNumValues += split.length;
    data[i] = new int[split.length];
    for (int j = 0; j < split.length; j++) {
      int val = Integer.parseInt(split[j]);
      data[i][j] = val;
      if (val > max) {
        max = val;
      }
    }
  }
  // Bits needed to represent every value in [0, max]. ceil(log2(max)) is one bit short whenever
  // max is a power of two (max = 8 needs 4 bits, not 3) and degenerates to 0/NaN for max <= 1,
  // so derive the width from the position of the highest set bit and never go below one bit.
  int maxBitsNeeded = max <= 1 ? 1 : 32 - Integer.numberOfLeadingZeros(max);
  int size = 2048;
  // Start position of each document's values, relative to the first value of its chunk.
  int[] offsets = new int[size];
  int bitMapSize = 0;
  int valuesSeenInChunk = 0;
  File outputFile = new File("output.mv.fwd");
  FixedBitMultiValueWriter fixedBitSkipListSCMVWriter =
      new FixedBitMultiValueWriter(outputFile, totalDocs, totalNumValues, maxBitsNeeded);
  try {
    for (int i = 0; i < totalDocs; i++) {
      fixedBitSkipListSCMVWriter.setIntArray(i, data[i]);
      int posInChunk = i % size;
      offsets[posInChunk] = valuesSeenInChunk;
      valuesSeenInChunk += data[i].length;
      // A chunk ends after `size` documents, or at the last document for the trailing partial chunk.
      if (posInChunk == size - 1 || i == totalDocs - 1) {
        // posInChunk + 1 documents belong to this chunk; copying trims stale entries left over
        // from the previous (full) chunk.
        bitMapSize += serializedBitmapSize(Arrays.copyOf(offsets, posInChunk + 1));
        valuesSeenInChunk = 0;
      }
    }
  } finally {
    // Release the writer even when a row fails to be written.
    fixedBitSkipListSCMVWriter.close();
  }
  // Ceiling division, so an exact multiple of `size` does not count an extra empty chunk.
  int numChunks = (totalDocs + size - 1) / size;
  // Computed in long: totalNumValues * maxBitsNeeded overflows int for large inputs.
  long totalBits = (long) totalNumValues * maxBitsNeeded;
  long dataSizeinBytes = (totalBits + 7) / 8;
  System.out.println("Output file size:" + outputFile.length());
  System.out.println("totalNumberOfDoc\t\t\t:" + totalDocs);
  System.out.println("totalNumberOfValues\t\t\t:" + totalNumValues);
  System.out.println("chunk size\t\t\t\t:" + size);
  System.out.println("Num chunks\t\t\t\t:" + numChunks);
  System.out.println("Raw data size with fixed bit encoding\t:" + dataSizeinBytes);
  System.out.println("\nPer encoding size");
  System.out.println();
  System.out.println("size (offset + length)\t\t\t:" + (totalDocs * 8L + dataSizeinBytes));
  System.out.println();
  System.out.println("size (offset only)\t\t\t:" + (totalDocs * 4L + dataSizeinBytes));
  System.out.println();
  System.out.println("bitMapSize\t\t\t\t:" + bitMapSize);
  System.out.println("size (with bitmap)\t\t\t:" + (bitMapSize + (numChunks * 4L) + dataSizeinBytes));
  System.out.println();
  System.out.println("Custom Bitset\t\t\t\t:" + (totalNumValues + 7) / 8);
  System.out.println("size (with custom bitset)\t\t\t:" + (((totalNumValues + 7) / 8) + (numChunks * 4L) + dataSizeinBytes));
}

/** Returns the number of bytes a roaring bitmap holding {@code positions} occupies when serialized. */
private static int serializedBitmapSize(int[] positions) throws IOException {
  MutableRoaringBitmap bitmap = MutableRoaringBitmap.bitmapOf(positions);
  ByteArrayOutputStream bos = new ByteArrayOutputStream();
  try (DataOutputStream dos = new DataOutputStream(bos)) {
    bitmap.serialize(dos);
  }
  return bos.size();
}
Example use of com.linkedin.pinot.core.io.writer.impl.v1.FixedBitMultiValueWriter in the project pinot by linkedin.
From the class FixedBitSkipListSCMVWriterTest, method testSingleColMultiValue.
/**
 * Writes random multi-value rows through {@link FixedBitMultiValueWriter} for every bit width from
 * 2 to 31 and validates the resulting file section by section, in the order the test reads them:
 * the per-chunk start offsets, the bitset with one bit set at the first value of every row, and the
 * bit-packed values themselves.
 *
 * @throws Exception if the file cannot be written or read back
 */
@Test
public void testSingleColMultiValue() throws Exception {
  for (int maxBits = 2; maxBits < 32; maxBits++) {
    LOGGER.debug("START test maxBit:" + maxBits);
    File file = new File("test_single_col_multi_value_writer.dat");
    file.delete();
    int rows = 100;
    int[][] data = new int[rows][];
    // Math.pow is intentional: the cast saturates at Integer.MAX_VALUE for maxBits == 31,
    // whereas 1 << 31 would be negative and rejected by Random.nextInt.
    int maxValue = (int) Math.pow(2, maxBits);
    Random r = new Random();
    int totalNumValues = 0;
    for (int i = 0; i < rows; i++) {
      int numValues = r.nextInt(100) + 1;
      data[i] = new int[numValues];
      for (int j = 0; j < numValues; j++) {
        data[i][j] = r.nextInt(maxValue);
      }
      totalNumValues += numValues;
    }

    CustomBitSet bitSet = null;
    CustomBitSet customBit = null;
    try {
      FixedBitMultiValueWriter writer = new FixedBitMultiValueWriter(file, rows, totalNumValues, maxBits);
      int numChunks = writer.getNumChunks();
      int[] chunkOffsets = new int[numChunks];
      // Expected bit-packed payload, built independently of the writer.
      bitSet = CustomBitSet.withBitLength(totalNumValues * maxBits);
      try {
        int chunkId = 0;
        int offset = 0;
        int index = 0;
        for (int i = 0; i < rows; i++) {
          writer.setIntArray(i, data[i]);
          // The first row of every chunk records how many values precede the chunk.
          if (i % writer.getRowsPerChunk() == 0) {
            chunkOffsets[chunkId] = offset;
            chunkId = chunkId + 1;
          }
          offset += data[i].length;
          for (int j = 0; j < data[i].length; j++) {
            int value = data[i][j];
            // Most significant bit first, maxBits bits per value.
            for (int bitPos = maxBits - 1; bitPos >= 0; bitPos--) {
              if ((value & (1 << bitPos)) != 0) {
                bitSet.setBit(index * maxBits + (maxBits - bitPos - 1));
              }
            }
            index = index + 1;
          }
        }
      } finally {
        // Close the writer even when a row fails to be written.
        writer.close();
      }
      LOGGER.trace("chunkOffsets: {}", Arrays.toString(chunkOffsets));

      //start validating the file
      try (RandomAccessFile raf = new RandomAccessFile(file, "r");
          DataInputStream dis = new DataInputStream(new FileInputStream(file))) {
        Assert.assertEquals(raf.length(), writer.getTotalSize());
        for (int i = 0; i < numChunks; i++) {
          Assert.assertEquals(dis.readInt(), chunkOffsets[i]);
        }
        int numBytesForBitmap = (totalNumValues + 7) / 8;
        Assert.assertEquals(writer.getBitsetSize(), numBytesForBitmap);
        byte[] bitsetBytes = new byte[numBytesForBitmap];
        // readFully: a plain read() may legally return fewer bytes than requested.
        dis.readFully(bitsetBytes);
        customBit = CustomBitSet.withByteBuffer(numBytesForBitmap, ByteBuffer.wrap(bitsetBytes));
        int rowStart = 0;
        LOGGER.trace(customBit.toString());
        for (int i = 0; i < rows; i++) {
          Assert.assertTrue(customBit.isBitSet(rowStart));
          rowStart += data[i].length;
        }
        byte[] byteArray = bitSet.toByteArray();
        LOGGER.trace("raf.length():" + raf.length());
        LOGGER.trace("getTotalSize:" + writer.getTotalSize());
        LOGGER.trace("getRawDataSize:" + writer.getRawDataSize());
        LOGGER.trace("getBitsetSize:" + writer.getBitsetSize());
        LOGGER.trace("getChunkOffsetHeaderSize:" + writer.getChunkOffsetHeaderSize());
        int dataLength = (int) (writer.getTotalSize() - writer.getChunkOffsetHeaderSize() - numBytesForBitmap);
        byte[] rawData = new byte[dataLength];
        // read the data segment that starts after the header.
        dis.readFully(rawData);
        Assert.assertEquals(rawData.length, byteArray.length);
        Assert.assertEquals(rawData, byteArray);
      }
    } finally {
      // Release the bitsets and the temp file even when an assertion above fails,
      // so one failing bit width does not leak resources or leave the file behind.
      if (bitSet != null) {
        bitSet.close();
      }
      if (customBit != null) {
        customBit.close();
      }
      file.delete();
    }
    LOGGER.debug("END test maxBit:" + maxBits);
  }
}
Aggregations