Search in sources :

Example 1 with MemPageStore

use of org.apache.parquet.column.page.mem.MemPageStore in project parquet-mr by apache.

the class TestCorruptDeltaByteArrays method testColumnReaderImplWithCorruptPage.

/**
 * Writes two delta-byte-array pages of ten strings each, with the writer
 * passed through {@code corruptWriter} before the second page, and checks that
 * {@code ColumnReaderImpl} still returns all twenty original strings in order.
 */
@Test
public void testColumnReaderImplWithCorruptPage() throws Exception {
    final int valuesPerPage = 10;

    ColumnDescriptor column = new ColumnDescriptor(new String[] { "s" }, PrimitiveType.PrimitiveTypeName.BINARY, 0, 0);
    MemPageStore pageStore = new MemPageStore(0);
    PageWriter pageWriter = pageStore.getPageWriter(column);
    ParquetProperties props = ParquetProperties.builder().withDictionaryEncoding(false).build();

    // Build one generic run of level bytes; it is used for both the
    // repetition and the definition levels of every page below.
    ValuesWriter levelWriter = props.newDefinitionLevelWriter(column);
    for (int n = 0; n < valuesPerPage; n++) {
        levelWriter.writeInteger(0);
    }
    // Copy into a byte array because the same BytesInput is reused for both pages.
    BytesInput levels = BytesInput.from(levelWriter.getBytes().toByteArray());

    DeltaByteArrayWriter deltaWriter = getDeltaByteArrayWriter();
    List<String> expected = new ArrayList<String>();
    String lastOfFirstPage = null;

    // ---- first page: values 0..9 ----
    for (int n = 0; n < valuesPerPage; n++) {
        lastOfFirstPage = str(n);
        deltaWriter.writeBytes(Binary.fromString(lastOfFirstPage));
        expected.add(lastOfFirstPage);
    }
    BytesInput firstPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
    pageWriter.writePage(
        firstPage,
        valuesPerPage, // number of values in the page
        new BinaryStatistics(),
        levelWriter.getEncoding(),
        levelWriter.getEncoding(),
        deltaWriter.getEncoding());
    pageStore.addRowCount(valuesPerPage);

    // reset() sets previous to new byte[0]; corruptWriter is then handed the
    // first page's last value.
    // NOTE(review): presumably this re-seeds the writer's "previous" value so the
    // second page depends on the first -- confirm against corruptWriter.
    deltaWriter.reset();
    corruptWriter(deltaWriter, lastOfFirstPage);

    // ---- second page: values 10..19 ----
    for (int n = valuesPerPage; n < 2 * valuesPerPage; n++) {
        String current = str(n);
        deltaWriter.writeBytes(Binary.fromString(current));
        expected.add(current);
    }
    BytesInput secondPage = BytesInput.concat(levels, levels, deltaWriter.getBytes());
    pageWriter.writePage(
        secondPage,
        valuesPerPage, // number of values in the page
        new BinaryStatistics(),
        levelWriter.getEncoding(),
        levelWriter.getEncoding(),
        deltaWriter.getEncoding());
    pageStore.addRowCount(valuesPerPage);

    // Collect every binary the reader hands to the converter, decoded as UTF-8.
    final List<String> decoded = new ArrayList<String>();
    PrimitiveConverter collector = new PrimitiveConverter() {

        @Override
        public void addBinary(Binary value) {
            decoded.add(value.toStringUsingUTF8());
        }
    };

    // The reader is told the pages were written by parquet-mr 1.6.0.
    ParsedVersion writerVersion = new ParsedVersion("parquet-mr", "1.6.0", "abcd");
    ColumnReaderImpl reader = new ColumnReaderImpl(column, pageStore.getPageReader(column), collector, writerVersion);

    // Drain the column: push the current value to the converter, then advance.
    while (decoded.size() < reader.getTotalValueCount()) {
        reader.writeCurrentValueToConverter();
        reader.consume();
    }

    Assert.assertEquals(expected, decoded);
}
Also used : BytesInput(org.apache.parquet.bytes.BytesInput) DeltaByteArrayWriter(org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) ParquetProperties(org.apache.parquet.column.ParquetProperties) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) PrimitiveConverter(org.apache.parquet.io.api.PrimitiveConverter) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) Binary(org.apache.parquet.io.api.Binary) ValuesWriter(org.apache.parquet.column.values.ValuesWriter) ParsedVersion(org.apache.parquet.VersionParser.ParsedVersion) PageWriter(org.apache.parquet.column.page.PageWriter) Test(org.junit.Test)

Example 2 with MemPageStore

use of org.apache.parquet.column.page.mem.MemPageStore in project parquet-mr by apache.

the class ValidatingColumnWriteStore method testReadUsingRequestedSchemaWithExtraFields.

/**
 * Writes one record under a two-field schema (a, b) and one under a
 * three-field schema (b, a, c), reads both back using the three-field schema
 * as the requested schema, and validates the resulting groups.
 */
@Test
public void testReadUsingRequestedSchemaWithExtraFields() {
    MessageType originalSchema = new MessageType(
        "schema",
        new PrimitiveType(REQUIRED, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "b"));
    MessageType widerSchema = new MessageType(
        "schema",
        new PrimitiveType(OPTIONAL, INT32, "b"),
        new PrimitiveType(OPTIONAL, INT32, "a"),
        new PrimitiveType(OPTIONAL, INT32, "c"));

    MemPageStore originalStore = new MemPageStore(1);
    MemPageStore widerStore = new MemPageStore(1);

    // One record per store, each written with the schema it was built for.
    SimpleGroupFactory originalFactory = new SimpleGroupFactory(originalSchema);
    Group originalRecord = originalFactory.newGroup().append("a", 1).append("b", 2);
    writeGroups(originalSchema, originalStore, originalRecord);

    SimpleGroupFactory widerFactory = new SimpleGroupFactory(widerSchema);
    Group widerRecord = widerFactory.newGroup().append("a", 1).append("b", 2).append("c", 3);
    writeGroups(widerSchema, widerStore, widerRecord);

    // Both stores are read with the wider schema as the requested schema.
    List<Group> readBack = new ArrayList<Group>();
    readBack.addAll(readGroups(originalStore, originalSchema, widerSchema, 1));
    readBack.addAll(readGroups(widerStore, widerSchema, widerSchema, 1));
    // TODO: add a third read (and its expected row) once we have the support
    // for empty projection.

    // Expected values are listed in the requested schema's field order (b, a, c);
    // "c" is null for the record whose file schema has no such field.
    Object[][] expectedRows = {
        { 2, 1, null },
        { 2, 1, 3 }
    };
    validateGroups(readBack, expectedRows);
}
Also used : Group(org.apache.parquet.example.data.Group) PrimitiveType(org.apache.parquet.schema.PrimitiveType) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) ArrayList(java.util.ArrayList) List(java.util.List) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 3 with MemPageStore

use of org.apache.parquet.column.page.mem.MemPageStore in project parquet-mr by apache.

the class ValidatingColumnWriteStore method testReadUsingProjectedSchema.

/**
 * Writes one record with fields a and b, reads it back requesting only
 * field b, and validates that the single resulting group holds just b's value.
 */
@Test
public void testReadUsingProjectedSchema() {
    MessageType fileSchema = new MessageType(
        "schema",
        new PrimitiveType(REQUIRED, INT32, "a"),
        new PrimitiveType(REQUIRED, INT32, "b"));
    MessageType requestedSchema = new MessageType(
        "schema",
        new PrimitiveType(OPTIONAL, INT32, "b"));

    MemPageStore pageStore = new MemPageStore(1);

    // Write a single record { a = 1, b = 2 } using the full file schema.
    SimpleGroupFactory factory = new SimpleGroupFactory(fileSchema);
    Group record = factory.newGroup().append("a", 1).append("b", 2);
    writeGroups(fileSchema, pageStore, record);

    // Read it back through the one-column projection.
    List<Group> readBack = new ArrayList<Group>();
    readBack.addAll(readGroups(pageStore, fileSchema, requestedSchema, 1));

    Object[][] expectedRows = { { 2 } };
    validateGroups(readBack, expectedRows);
}
Also used : Group(org.apache.parquet.example.data.Group) PrimitiveType(org.apache.parquet.schema.PrimitiveType) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) ArrayList(java.util.ArrayList) List(java.util.List) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 4 with MemPageStore

use of org.apache.parquet.column.page.mem.MemPageStore in project parquet-mr by apache.

the class ValidatingColumnWriteStore method testEmptyField.

/**
 * Starts the "Links" field and ends it immediately without adding a value,
 * expecting the record writer to reject it with a ParquetEncodingException
 * carrying a specific message.
 */
@Test
public void testEmptyField() {
    MemPageStore pageStore = new MemPageStore(1);
    ColumnWriteStoreV1 writeStore = newColumnWriteStore(pageStore);
    MessageColumnIO messageIO = new ColumnIOFactory(true).getColumnIO(schema);
    RecordConsumer consumer = messageIO.getRecordWriter(writeStore);

    consumer.startMessage();

    // A field with a value in it: accepted.
    consumer.startField("DocId", 0);
    consumer.addLong(0);
    consumer.endField("DocId", 0);

    // A field opened and closed with nothing in between: must be rejected.
    consumer.startField("Links", 1);
    try {
        consumer.endField("Links", 1);
        Assert.fail("expected exception because of empty field");
    } catch (ParquetEncodingException rejected) {
        // The expected text is compared verbatim, spelling included.
        Assert.assertEquals(
            "empty fields are illegal, the field should be ommited completely instead",
            rejected.getMessage());
    }
}
Also used : ColumnWriteStoreV1(org.apache.parquet.column.impl.ColumnWriteStoreV1) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) Test(org.junit.Test)

Example 5 with MemPageStore

use of org.apache.parquet.column.page.mem.MemPageStore in project parquet-mr by apache.

the class TestFiltered method writeTestRecords.

/**
 * Writes the records {@code r1} and {@code r2} alternately, {@code number}
 * times each, into a fresh in-memory page store and returns that store.
 *
 * @param columnIO column IO used to obtain the record writer
 * @param number   how many (r1, r2) pairs to write
 * @return the page store holding the flushed pages
 */
private MemPageStore writeTestRecords(MessageColumnIO columnIO, int number) {
    // Two records are written per pair, hence number * 2.
    MemPageStore pageStore = new MemPageStore(number * 2);

    ParquetProperties props = ParquetProperties.builder()
        .withPageSize(800)
        .withDictionaryEncoding(false)
        .build();
    ColumnWriteStoreV1 writeStore = new ColumnWriteStoreV1(pageStore, props);

    RecordConsumer consumer = columnIO.getRecordWriter(writeStore);
    GroupWriter writer = new GroupWriter(consumer, schema);
    for (int pair = 0; pair < number; pair++) {
        writer.write(r1);
        writer.write(r2);
    }

    // Flush the record consumer first, then the column write store.
    consumer.flush();
    writeStore.flush();
    return pageStore;
}
Also used : ColumnWriteStoreV1(org.apache.parquet.column.impl.ColumnWriteStoreV1) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) GroupWriter(org.apache.parquet.example.data.GroupWriter)

Aggregations

MemPageStore (org.apache.parquet.column.page.mem.MemPageStore)26 Test (org.junit.Test)21 Group (org.apache.parquet.example.data.Group)12 ColumnWriteStoreV1 (org.apache.parquet.column.impl.ColumnWriteStoreV1)11 MessageType (org.apache.parquet.schema.MessageType)10 GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter)8 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)6 RecordConsumer (org.apache.parquet.io.api.RecordConsumer)6 ArrayList (java.util.ArrayList)4 ColumnReader (org.apache.parquet.column.ColumnReader)4 ColumnWriter (org.apache.parquet.column.ColumnWriter)4 GroupWriter (org.apache.parquet.example.data.GroupWriter)4 SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory)4 PrimitiveType (org.apache.parquet.schema.PrimitiveType)4 List (java.util.List)3 PageWriter (org.apache.parquet.column.page.PageWriter)2 ParsedVersion (org.apache.parquet.VersionParser.ParsedVersion)1 BytesInput (org.apache.parquet.bytes.BytesInput)1 ParquetProperties (org.apache.parquet.column.ParquetProperties)1 DataPage (org.apache.parquet.column.page.DataPage)1