Search in sources:

Example 21 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class CheckParquet251Command, method check.

private String check(String file) throws IOException {
    Path path = qualifiedPath(file);
    ParquetMetadata footer = ParquetFileReader.readFooter(getConf(), path, ParquetMetadataConverter.NO_FILTER);
    FileMetaData meta = footer.getFileMetaData();
    String createdBy = meta.getCreatedBy();
    if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
        // replace created-by with the current writer version so the reader returns the (possibly corrupt) stats instead of ignoring them
        FileMetaData fakeMeta = new FileMetaData(meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);
        // get just the binary columns
        List<ColumnDescriptor> columns = Lists.newArrayList();
        Iterables.addAll(columns, Iterables.filter(meta.getSchema().getColumns(), new Predicate<ColumnDescriptor>() {

            @Override
            public boolean apply(@Nullable ColumnDescriptor input) {
                return input != null && input.getType() == BINARY;
            }
        }));
        // now check to see if the data is actually corrupt
        ParquetFileReader reader = new ParquetFileReader(getConf(), fakeMeta, path, footer.getBlocks(), columns);
        try {
            PageStatsValidator validator = new PageStatsValidator();
            for (PageReadStore pages = reader.readNextRowGroup(); pages != null; pages = reader.readNextRowGroup()) {
                validator.validate(columns, pages);
            }
        } catch (BadStatsException e) {
            return e.getMessage();
        }
    }
    return null;
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), PageReadStore (org.apache.parquet.column.page.PageReadStore), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData), Nullable (javax.annotation.Nullable), Predicate (com.google.common.base.Predicate)
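
Note: the Guava Predicate above simply selects the BINARY columns. On Java 8+ the same filter can be written with streams; a minimal equivalent sketch, not part of the original command:

List<ColumnDescriptor> binaryColumns = meta.getSchema().getColumns().stream()
        .filter(c -> c.getType() == BINARY)   // keep only binary-typed columns
        .collect(java.util.stream.Collectors.toList());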

Example 22 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestMemColumn, method testMemColumn.

@Test
public void testMemColumn() throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
    ColumnDescriptor path = schema.getColumnDescription(new String[] { "foo", "bar" });
    MemPageStore memPageStore = new MemPageStore(10);
    ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
    ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
    columnWriter.write(42L, 0, 0);
    memColumnsStore.flush();
    ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
    for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
        assertEquals(0, columnReader.getCurrentRepetitionLevel());
        assertEquals(0, columnReader.getCurrentDefinitionLevel());
        assertEquals(42L, columnReader.getLong());
        columnReader.consume();
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnWriteStoreV1 (org.apache.parquet.column.impl.ColumnWriteStoreV1), MemPageStore (org.apache.parquet.column.page.mem.MemPageStore), ColumnReader (org.apache.parquet.column.ColumnReader), ColumnWriter (org.apache.parquet.column.ColumnWriter), MessageType (org.apache.parquet.schema.MessageType), Test (org.junit.Test)
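
The test calls two private helpers that are not shown here, newColumnWriteStoreImpl and getColumnReader. A plausible reconstruction against the parquet-mr 1.10+ API is sketched below; the exact bodies in TestMemColumn may differ between versions, and the page size is only an assumption:

private ColumnWriteStoreV1 newColumnWriteStoreImpl(MemPageStore memPageStore) {
    // a small page size makes the multi-page tests below actually produce several pages
    return new ColumnWriteStoreV1(memPageStore, ParquetProperties.builder()
            .withPageSize(2048)
            .withDictionaryEncoding(false)
            .build());
}

private ColumnReader getColumnReader(MemPageStore memPageStore, ColumnDescriptor path, MessageType schema) {
    // GroupRecordConverter is just a convenient concrete converter; the test pulls values manually
    return new ColumnReadStoreImpl(memPageStore,
            new GroupRecordConverter(schema).getRootConverter(), schema, null)
        .getColumnReader(path);
}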

Example 23 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestMemColumn, method testMemColumnSeveralPages.

@Test
public void testMemColumnSeveralPages() throws Exception {
    MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
    String[] col = new String[] { "foo", "bar" };
    MemPageStore memPageStore = new MemPageStore(10);
    ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
    ColumnDescriptor path = mt.getColumnDescription(col);
    ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
    for (int i = 0; i < 2000; i++) {
        columnWriter.write(42L, 0, 0);
    }
    memColumnsStore.flush();
    ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
    for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
        assertEquals(0, columnReader.getCurrentRepetitionLevel());
        assertEquals(0, columnReader.getCurrentDefinitionLevel());
        assertEquals(42L, columnReader.getLong());
        columnReader.consume();
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnWriteStoreV1 (org.apache.parquet.column.impl.ColumnWriteStoreV1), MemPageStore (org.apache.parquet.column.page.mem.MemPageStore), ColumnReader (org.apache.parquet.column.ColumnReader), ColumnWriter (org.apache.parquet.column.ColumnWriter), MessageType (org.apache.parquet.schema.MessageType), Test (org.junit.Test)

Example 24 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestMemColumn, method testMemColumnSeveralPagesRepeated.

@Test
public void testMemColumnSeveralPagesRepeated() throws Exception {
    MessageType mt = MessageTypeParser.parseMessageType("message msg { repeated group foo { repeated int64 bar; } }");
    String[] col = new String[] { "foo", "bar" };
    MemPageStore memPageStore = new MemPageStore(10);
    ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
    ColumnDescriptor path = mt.getColumnDescription(col);
    ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
    int[] rs = { 0, 0, 0, 1, 1, 1, 2, 2, 2 };
    int[] ds = { 0, 1, 2, 0, 1, 2, 0, 1, 2 };
    for (int i = 0; i < 837; i++) {
        int r = rs[i % rs.length];
        int d = ds[i % ds.length];
        LOG.debug("write i: {}", i);
        if (d == 2) {
            columnWriter.write((long) i, r, d);
        } else {
            columnWriter.writeNull(r, d);
        }
    }
    memColumnsStore.flush();
    ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
    int i = 0;
    for (int j = 0; j < columnReader.getTotalValueCount(); j++) {
        int r = rs[i % rs.length];
        int d = ds[i % ds.length];
        LOG.debug("read i: {}", i);
        assertEquals("r row " + i, r, columnReader.getCurrentRepetitionLevel());
        assertEquals("d row " + i, d, columnReader.getCurrentDefinitionLevel());
        if (d == 2) {
            assertEquals("data row " + i, (long) i, columnReader.getLong());
        }
        columnReader.consume();
        ++i;
    }
}
Also used: ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ColumnWriteStoreV1 (org.apache.parquet.column.impl.ColumnWriteStoreV1), MemPageStore (org.apache.parquet.column.page.mem.MemPageStore), ColumnReader (org.apache.parquet.column.ColumnReader), ColumnWriter (org.apache.parquet.column.ColumnWriter), MessageType (org.apache.parquet.schema.MessageType), Test (org.junit.Test)
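
The rs/ds arrays cycle through repetition and definition levels 0..2, which matches the schema: both foo and bar are repeated, so the column foo.bar has maximum repetition and definition level 2, and any value with d < 2 is a null written via writeNull. A short sketch showing where those maxima come from:

MessageType mt = MessageTypeParser.parseMessageType("message msg { repeated group foo { repeated int64 bar; } }");
ColumnDescriptor bar = mt.getColumnDescription(new String[] { "foo", "bar" });
// two repeated fields on the path foo.bar give maximum levels of 2
System.out.println(bar.getMaxRepetitionLevel()); // 2
System.out.println(bar.getMaxDefinitionLevel()); // 2
// r says at which repeated level a value continues (r == 0 starts a new record);
// d says how deep the path is actually defined (d < 2 means null at that depth)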

Example 25 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class ParquetFileWriter, method appendRowGroup.

public void appendRowGroup(SeekableInputStream from, BlockMetaData rowGroup, boolean dropColumns) throws IOException {
    startBlock(rowGroup.getRowCount());
    Map<String, ColumnChunkMetaData> columnsToCopy = new HashMap<String, ColumnChunkMetaData>();
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
        columnsToCopy.put(chunk.getPath().toDotString(), chunk);
    }
    List<ColumnChunkMetaData> columnsInOrder = new ArrayList<ColumnChunkMetaData>();
    for (ColumnDescriptor descriptor : schema.getColumns()) {
        String path = ColumnPath.get(descriptor.getPath()).toDotString();
        ColumnChunkMetaData chunk = columnsToCopy.remove(path);
        if (chunk != null) {
            columnsInOrder.add(chunk);
        } else {
            throw new IllegalArgumentException(String.format("Missing column '%s', cannot copy row group: %s", path, rowGroup));
        }
    }
    // complain if some columns would be dropped and that's not okay
    if (!dropColumns && !columnsToCopy.isEmpty()) {
        throw new IllegalArgumentException(String.format("Columns cannot be copied (missing from target schema): %s", Strings.join(columnsToCopy.keySet(), ", ")));
    }
    // copy the data for all chunks
    long start = -1;
    long length = 0;
    long blockCompressedSize = 0;
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
        ColumnChunkMetaData chunk = columnsInOrder.get(i);
        // get this chunk's start position in the new file
        long newChunkStart = out.getPos() + length;
        // add this chunk to be copied with any previous chunks
        if (start < 0) {
            // no previous chunk included, start at this chunk's starting pos
            start = chunk.getStartingPos();
        }
        length += chunk.getTotalSize();
        if ((i + 1) == columnsInOrder.size() || columnsInOrder.get(i + 1).getStartingPos() != (start + length)) {
            // this is the last chunk, or the next chunk is not contiguous with the buffered range: copy now
            copy(from, out, start, length);
            // reset to start at the next column chunk
            start = -1;
            length = 0;
        }
        currentBlock.addColumn(ColumnChunkMetaData.get(chunk.getPath(), chunk.getPrimitiveType(), chunk.getCodec(), chunk.getEncodingStats(), chunk.getEncodings(), chunk.getStatistics(), newChunkStart, newChunkStart, chunk.getValueCount(), chunk.getTotalSize(), chunk.getTotalUncompressedSize()));
        blockCompressedSize += chunk.getTotalSize();
    }
    currentBlock.setTotalByteSize(blockCompressedSize);
    endBlock();
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), HashMap (java.util.HashMap), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList)
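
For context, a hedged usage sketch of appendRowGroup: copying every row group of an existing file into an open writer. The names conf, inputPath, and writer are assumptions set up elsewhere:

ParquetMetadata footer = ParquetFileReader.readFooter(conf, inputPath, ParquetMetadataConverter.NO_FILTER);
try (SeekableInputStream in = HadoopInputFile.fromPath(inputPath, conf).newStream()) {
    for (BlockMetaData block : footer.getBlocks()) {
        // dropColumns == false: fail loudly if the source has columns missing from the target schema
        writer.appendRowGroup(in, block, false);
    }
}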

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88
MessageType (org.apache.parquet.schema.MessageType): 33
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18
Test (org.testng.annotations.Test): 18
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16
ArrayList (java.util.ArrayList): 16
GroupType (org.apache.parquet.schema.GroupType): 14
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12
Test (org.junit.Test): 12
Domain (com.facebook.presto.common.predicate.Domain): 11
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11
Path (org.apache.hadoop.fs.Path): 11
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11
List (java.util.List): 10
ImmutableList (com.google.common.collect.ImmutableList): 9
HashMap (java.util.HashMap): 9
Configuration (org.apache.hadoop.conf.Configuration): 9
Type (org.apache.parquet.schema.Type): 9
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8
IOException (java.io.IOException): 7