Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
In class TestDictionary, method getBytesAndCheckEncoding.
private BytesInput getBytesAndCheckEncoding(ValuesWriter cw, Encoding encoding) throws IOException {
  // Copy the page bytes before reset(), which may reuse the writer's internal buffers.
  BytesInput bytes = BytesInput.copy(cw.getBytes());
  assertEquals(encoding, cw.getEncoding());
  cw.reset();
  return bytes;
}
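For context, a minimal sketch of how a helper like this is typically driven; the writer factory and the values written below are assumptions for illustration, not taken from the source:

  ValuesWriter cw = newPlainIntegerDictionaryValuesWriter(10000, 10000); // assumed test helper
  for (int i = 0; i < 100; i++) {
    cw.writeInteger(i % 10); // 10 distinct values fit comfortably in the dictionary
  }
  // The copy returned here survives cw.reset(), so it can still be replayed later.
  BytesInput page = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  byte[] raw = page.toByteArray(); // materialize, or use page.toInputStream() to feed a reader

The copy matters because cw.getBytes() may expose the writer's live buffers; BytesInput.copy detaches the snapshot before reset() reclaims them.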
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
In class TestDictionary, method testDoubleDictionary.
@Test
public void testDoubleDictionary() throws IOException {
  int COUNT = 1000;
  int COUNT2 = 2000;
  final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw =
      newPlainDoubleDictionaryValuesWriter(10000, 10000);
  for (double i = 0; i < COUNT; i++) {
    cw.writeDouble(i % 50);
  }
  BytesInput bytes1 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());
  for (double i = COUNT2; i > 0; i--) {
    cw.writeDouble(i % 50);
  }
  BytesInput bytes2 = getBytesAndCheckEncoding(cw, PLAIN_DICTIONARY);
  assertEquals(50, cw.initialWriter.getDictionarySize());
  final DictionaryValuesReader cr = initDicReader(cw, DOUBLE);
  cr.initFromPage(COUNT, bytes1.toInputStream());
  for (double i = 0; i < COUNT; i++) {
    double back = cr.readDouble();
    assertEquals(i % 50, back, 0.0);
  }
  cr.initFromPage(COUNT2, bytes2.toInputStream());
  for (double i = COUNT2; i > 0; i--) {
    double back = cr.readDouble();
    assertEquals(i % 50, back, 0.0);
  }
}
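The FallbackValuesWriter above stays on the dictionary encoding only while the dictionary fits its byte budget. A sketch of the fallback path, assuming the same test helper and a deliberately tiny budget (the threshold value is an assumption for illustration):

  final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> fw =
      newPlainDoubleDictionaryValuesWriter(2, 10000); // assumed: 2-byte dictionary budget
  for (double i = 0; i < 100; i++) {
    fw.writeDouble(i); // 100 distinct 8-byte values overflow the tiny dictionary
  }
  // After the overflow the writer abandons the dictionary and emits plain-encoded values.
  BytesInput plainBytes = getBytesAndCheckEncoding(fw, PLAIN);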
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
In class ColumnReaderImpl, method readPageV1.
private void readPageV1(DataPageV1 page) {
  ValuesReader rlReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", bytes.size(), pageValueCount);
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream in = bytes.toInputStream();
    rlReader.initFromPage(pageValueCount, in);
    LOG.debug("reading definition levels at {}", in.position());
    dlReader.initFromPage(pageValueCount, in);
    LOG.debug("reading data at {}", in.position());
    initDataReader(page.getValueEncoding(), in, page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, e);
  }
}
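The contract this method relies on is that a V1 data page is one contiguous buffer with repetition levels, definition levels, and values laid out back to back, so the three readers can share a single stream. A sketch of that layout; rlBytes, dlBytes, dataBytes, and the reader variables are hypothetical placeholders:

  // Each initFromPage call consumes its section and leaves the stream
  // positioned at the start of the next one.
  BytesInput pageBytes = BytesInput.concat(rlBytes, dlBytes, dataBytes);
  ByteBufferInputStream in = pageBytes.toInputStream();
  rlReader.initFromPage(valueCount, in);   // advances past the repetition levels
  dlReader.initFromPage(valueCount, in);   // advances past the definition levels
  dataReader.initFromPage(valueCount, in); // the remaining bytes are the values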
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
In class TestCorruptDeltaByteArrays, method testColumnReaderImplWithCorruptPage.
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
  ColumnDescriptor column = new ColumnDescriptor(
      new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
  MemPageStore pages = new MemPageStore(0);
  PageWriter memWriter = pages.getPageWriter(column);
  ParquetProperties parquetProps = ParquetProperties.builder()
      .withDictionaryEncoding(false)
      .build();
  // get generic repetition and definition level bytes to use for pages
  ValuesWriter rdValues = parquetProps.newDefinitionLevelWriter(column);
  for (int i = 0; i < 10; i += 1) {
    rdValues.writeInteger(0);
  }
  // use a byte array backed BytesInput because it is reused
  BytesInput rd = BytesInput.from(rdValues.getBytes().toByteArray());
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();
  String lastValue = null;
  List<String> values = new ArrayList<String>();
  for (int i = 0; i < 10; i += 1) {
    lastValue = str(i);
    writer.writeBytes(Binary.fromString(lastValue));
    values.add(lastValue);
  }
  memWriter.writePage(
      BytesInput.concat(rd, rd, writer.getBytes()),
      10, // number of values in the page
      new BinaryStatistics(),
      rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
  pages.addRowCount(10);
  // sets previous to new byte[0]
  writer.reset();
  corruptWriter(writer, lastValue);
  for (int i = 10; i < 20; i += 1) {
    String value = str(i);
    writer.writeBytes(Binary.fromString(value));
    values.add(value);
  }
  memWriter.writePage(
      BytesInput.concat(rd, rd, writer.getBytes()),
      10, // number of values in the page
      new BinaryStatistics(),
      rdValues.getEncoding(), rdValues.getEncoding(), writer.getEncoding());
  pages.addRowCount(10);
  final List<String> actualValues = new ArrayList<String>();
  PrimitiveConverter converter = new PrimitiveConverter() {
    @Override
    public void addBinary(Binary value) {
      actualValues.add(value.toStringUsingUTF8());
    }
  };
  ColumnReaderImpl columnReader = new ColumnReaderImpl(
      column, pages.getPageReader(column), converter,
      new ParsedVersion("parquet-mr", "1.6.0", "abcd"));
  while (actualValues.size() < columnReader.getTotalValueCount()) {
    columnReader.writeCurrentValueToConverter();
    columnReader.consume();
  }
  Assert.assertEquals(values, actualValues);
}
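The "byte array backed BytesInput" comment in this test points at a general pitfall worth isolating: the BytesInput returned by a writer's getBytes() may be a view over the writer's live buffers, so a BytesInput that will be reused across pages must be materialized first. A short sketch of the safe pattern, reusing the names from the test:

  BytesInput live = rdValues.getBytes();                 // may alias the writer's buffers
  BytesInput safe = BytesInput.from(live.toByteArray()); // stable, detached copy
  // 'safe' can now be concatenated into several pages while rdValues keeps writing.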
Use of org.apache.parquet.bytes.BytesInput in project parquet-mr by apache.
In class TestColumnChunkPageWriteStore, method testColumnOrderV1.
@Test
public void testColumnOrderV1() throws IOException {
  ParquetFileWriter mockFileWriter = Mockito.mock(ParquetFileWriter.class);
  InOrder inOrder = inOrder(mockFileWriter);
  MessageType schema = Types.buildMessage()
      .required(BINARY).as(UTF8).named("a_string")
      .required(INT32).named("an_int")
      .required(INT64).named("a_long")
      .required(FLOAT).named("a_float")
      .required(DOUBLE).named("a_double")
      .named("order_test");
  BytesInput fakeData = BytesInput.fromInt(34);
  int fakeCount = 3;
  BinaryStatistics fakeStats = new BinaryStatistics();
  // TODO - look back at this, an allocator was being passed here in the ByteBuffer changes
  // see comment at this constructor
  ColumnChunkPageWriteStore store =
      new ColumnChunkPageWriteStore(compressor(UNCOMPRESSED), schema, new HeapByteBufferAllocator());
  for (ColumnDescriptor col : schema.getColumns()) {
    PageWriter pageWriter = store.getPageWriter(col);
    pageWriter.writePage(fakeData, fakeCount, fakeStats, RLE, RLE, PLAIN);
  }
  // flush to the mock writer
  store.flushToFileWriter(mockFileWriter);
  for (ColumnDescriptor col : schema.getColumns()) {
    inOrder.verify(mockFileWriter).startColumn(eq(col), eq((long) fakeCount), eq(UNCOMPRESSED));
  }
}
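BytesInput.fromInt, used above to fabricate page data, is a convenient way to produce a tiny, deterministic payload for tests. A sketch of what it yields; the exact byte values shown are an assumption based on Parquet's little-endian plain int32 encoding:

  BytesInput fakeData = BytesInput.fromInt(34);
  assertEquals(4, fakeData.size());        // a plain int32 is always 4 bytes
  byte[] payload = fakeData.toByteArray(); // little-endian: { 34, 0, 0, 0 }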