Search in sources :

Example 6 with ColumnReader

use of org.apache.parquet.column.ColumnReader in project parquet-mr by apache.

Method column of the class ColumnRecordFilter.

/**
 * Builds an unbound record filter that evaluates the given predicate against one column.
 * If the path names a repeated sub-attribute, only the first instance of it in the
 * object is ever matched against.
 *
 * @param columnPath dot-separated path of the column, e.g. "engine.capacity"
 * @param predicate check applied to the column; should call getBinary etc. and test the value
 * @return an unbound filter that locates the column once it is bound to a set of readers
 */
public static final UnboundRecordFilter column(final String columnPath, final ColumnPredicates.Predicate predicate) {
    checkNotNull(columnPath, "columnPath");
    checkNotNull(predicate, "predicate");
    // Split once up front; the anonymous filter below captures the segments.
    final String[] pathSegments = columnPath.split("\\.");
    return new UnboundRecordFilter() {

        @Override
        public RecordFilter bind(Iterable<ColumnReader> readers) {
            for (ColumnReader candidate : readers) {
                String[] candidatePath = candidate.getDescriptor().getPath();
                if (!Arrays.equals(candidatePath, pathSegments)) {
                    continue;
                }
                // First reader whose descriptor path equals the requested path wins.
                return new ColumnRecordFilter(candidate, predicate);
            }
            throw new IllegalArgumentException("Column " + columnPath + " does not exist.");
        }
    };
}
Also used : ColumnReader(org.apache.parquet.column.ColumnReader)

Example 7 with ColumnReader

use of org.apache.parquet.column.ColumnReader in project parquet-mr by apache.

Method skipToMatch of the class FilteredRecordReader.

/**
 * Skips forwards record by record until the record filter reports a match or
 * every record has been read.
 *
 * This method returns nothing: when it exits, either recordFilter.isMatch() is true
 * for the current position or recordsRead has reached recordCount.
 */
private void skipToMatch() {
    // Keep stepping over records while there are records left and the filter rejects the current one.
    while (recordsRead < recordCount && !recordFilter.isMatch()) {
        // Walk the column states of the current record, starting from state 0.
        State currentState = getState(0);
        do {
            ColumnReader columnReader = currentState.column;
            // When the definition level reaches this column's maximum, call skip() on the reader
            // (presumably a value is present at that level and must be stepped over - confirm
            // against the ColumnReader contract).
            if (columnReader.getCurrentDefinitionLevel() >= currentState.maxDefinitionLevel) {
                columnReader.skip();
            }
            // Advance the reader past the current entry.
            columnReader.consume();
            // Based on repetition level work out next state to go to; a column whose
            // maximum repetition level is 0 always uses repetition level 0.
            int nextR = currentState.maxRepetitionLevel == 0 ? 0 : columnReader.getCurrentRepetitionLevel();
            currentState = currentState.getNextState(nextR);
            // A null next state ends the walk for this record.
        } while (currentState != null);
        // The rejected record has been fully stepped over; count it as read.
        ++recordsRead;
    }
}
Also used : ColumnReader(org.apache.parquet.column.ColumnReader)

Example 8 with ColumnReader

use of org.apache.parquet.column.ColumnReader in project parquet-mr by apache.

Method testOptional of the class TestColumnReaderImpl.

/**
 * Writes {@code rows} null entries to an optional binary column through ColumnWriterV2,
 * then reads them back and checks that every entry has repetition and definition
 * level 0 and that the converter count stays at 0.
 */
@Test
public void testOptional() throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType("message test { optional binary foo; }");
    ColumnDescriptor descriptor = schema.getColumns().get(0);
    MemPageWriter memWriter = new MemPageWriter();
    ParquetProperties writerProps = ParquetProperties.builder()
            .withDictionaryPageSize(1024)
            .withWriterVersion(PARQUET_2_0)
            .withPageSize(2048)
            .build();
    ColumnWriterV2 writer = new ColumnWriterV2(descriptor, memWriter, writerProps);

    // Write only nulls, cutting a page after every 1000th entry.
    for (int written = 1; written <= rows; written++) {
        writer.writeNull(0, 0);
        if (written % 1000 == 0) {
            writer.writePage(written - 1);
        }
    }
    writer.writePage(rows);
    writer.finalizeColumnChunk();

    // The pages together must account for every row and every value written.
    List<DataPage> writtenPages = memWriter.getPages();
    int totalValues = 0;
    int totalRows = 0;
    for (DataPage page : writtenPages) {
        totalValues += page.getValueCount();
        DataPageV2 pageV2 = (DataPageV2) page;
        totalRows += pageV2.getRowCount();
    }
    assertEquals(rows, totalRows);
    assertEquals(rows, totalValues);

    MemPageReader memReader = new MemPageReader((long) rows, writtenPages.iterator(), memWriter.getDictionaryPage());
    ValidatingConverter validator = new ValidatingConverter();
    ColumnReader reader = new ColumnReaderImpl(descriptor, memReader, validator, VersionParser.parse(Version.FULL_VERSION));
    int remaining = rows;
    while (remaining > 0) {
        assertEquals(0, reader.getCurrentRepetitionLevel());
        assertEquals(0, reader.getCurrentDefinitionLevel());
        reader.consume();
        remaining--;
    }
    // No value was ever handed to the converter in this test.
    assertEquals(0, validator.count);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MemPageWriter(org.apache.parquet.column.page.mem.MemPageWriter) ColumnReader(org.apache.parquet.column.ColumnReader) MessageType(org.apache.parquet.schema.MessageType) MemPageReader(org.apache.parquet.column.page.mem.MemPageReader) Test(org.junit.Test)

Example 9 with ColumnReader

use of org.apache.parquet.column.ColumnReader in project parquet-mr by apache.

Method test of the class TestColumnReaderImpl.

/**
 * Writes {@code rows} binary values ("bar0" .. "bar9", cycling) to a required column
 * through ColumnWriterV2, then reads them back, pushing each value to the converter,
 * and checks that the converter counted {@code rows} values.
 */
@Test
public void test() throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
    ColumnDescriptor descriptor = schema.getColumns().get(0);
    MemPageWriter memWriter = new MemPageWriter();
    ParquetProperties writerProps = ParquetProperties.builder()
            .withDictionaryPageSize(1024)
            .withWriterVersion(PARQUET_2_0)
            .withPageSize(2048)
            .build();
    ColumnWriterV2 writer = new ColumnWriterV2(descriptor, memWriter, writerProps);

    for (int row = 0; row < rows; row++) {
        String text = "bar" + (row % 10);
        writer.write(Binary.fromString(text), 0, 0);
        // Cut a page after every 1000th value.
        boolean pageBoundary = (row + 1) % 1000 == 0;
        if (pageBoundary) {
            writer.writePage(row);
        }
    }
    writer.writePage(rows);
    writer.finalizeColumnChunk();

    // The pages together must account for every row and every value written.
    List<DataPage> writtenPages = memWriter.getPages();
    int totalValues = 0;
    int totalRows = 0;
    for (DataPage page : writtenPages) {
        totalValues += page.getValueCount();
        DataPageV2 pageV2 = (DataPageV2) page;
        totalRows += pageV2.getRowCount();
    }
    assertEquals(rows, totalRows);
    assertEquals(rows, totalValues);

    MemPageReader memReader = new MemPageReader((long) rows, writtenPages.iterator(), memWriter.getDictionaryPage());
    ValidatingConverter validator = new ValidatingConverter();
    ColumnReader reader = new ColumnReaderImpl(descriptor, memReader, validator, VersionParser.parse(Version.FULL_VERSION));
    int remaining = rows;
    while (remaining > 0) {
        assertEquals(0, reader.getCurrentRepetitionLevel());
        assertEquals(0, reader.getCurrentDefinitionLevel());
        reader.writeCurrentValueToConverter();
        reader.consume();
        remaining--;
    }
    assertEquals(rows, validator.count);
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MemPageWriter(org.apache.parquet.column.page.mem.MemPageWriter) ColumnReader(org.apache.parquet.column.ColumnReader) MessageType(org.apache.parquet.schema.MessageType) MemPageReader(org.apache.parquet.column.page.mem.MemPageReader) Test(org.junit.Test)

Example 10 with ColumnReader

use of org.apache.parquet.column.ColumnReader in project parquet-mr by apache.

Method testMemColumnBinary of the class TestMemColumn.

/**
 * Writes a single binary value "42" to the required column foo.bar through an
 * in-memory column write store, then reads the column back and checks that every
 * value has repetition level 0, definition level 0 and the text "42".
 */
@Test
public void testMemColumnBinary() throws Exception {
    MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required binary bar; } }");
    String[] col = new String[] { "foo", "bar" };
    MemPageStore memPageStore = new MemPageStore(10);
    ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
    ColumnDescriptor path = mt.getColumnDescription(col);
    ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
    columnWriter.write(Binary.fromString("42"), 0, 0);
    memColumnsStore.flush();
    ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
    // The total value count is read once; the loop below visits each value in turn.
    long totalValues = columnReader.getTotalValueCount();
    for (int i = 0; i < totalValues; i++) {
        // JUnit's assertEquals takes (expected, actual); keeping that order makes
        // failure messages report the right side as "expected".
        assertEquals(0, columnReader.getCurrentRepetitionLevel());
        assertEquals(0, columnReader.getCurrentDefinitionLevel());
        assertEquals("42", columnReader.getBinary().toStringUsingUTF8());
        columnReader.consume();
    }
}
Also used : ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ColumnWriteStoreV1(org.apache.parquet.column.impl.ColumnWriteStoreV1) MemPageStore(org.apache.parquet.column.page.mem.MemPageStore) ColumnReader(org.apache.parquet.column.ColumnReader) ColumnWriter(org.apache.parquet.column.ColumnWriter) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Aggregations

ColumnReader (org.apache.parquet.column.ColumnReader)10 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)6 MessageType (org.apache.parquet.schema.MessageType)6 Test (org.junit.Test)6 ColumnWriter (org.apache.parquet.column.ColumnWriter)4 ColumnWriteStoreV1 (org.apache.parquet.column.impl.ColumnWriteStoreV1)4 MemPageStore (org.apache.parquet.column.page.mem.MemPageStore)4 DataPage (org.apache.parquet.column.page.DataPage)2 MemPageReader (org.apache.parquet.column.page.mem.MemPageReader)2 MemPageWriter (org.apache.parquet.column.page.mem.MemPageWriter)2 PrimitiveStringifier (org.apache.parquet.schema.PrimitiveStringifier)1