
Example 61 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache, in the class TestParquetWriterAppendBlocks, method testMergedMetadata.

@Test
public void testMergedMetadata() throws IOException {
    Path combinedFile = newTemp();
    ParquetFileWriter writer = new ParquetFileWriter(CONF, FILE_SCHEMA, combinedFile);
    writer.start();
    writer.appendFile(CONF, file1);
    writer.appendFile(CONF, file2);
    writer.end(EMPTY_METADATA);
    ParquetMetadata combinedFooter = ParquetFileReader.readFooter(CONF, combinedFile, NO_FILTER);
    ParquetMetadata f1Footer = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
    ParquetMetadata f2Footer = ParquetFileReader.readFooter(CONF, file2, NO_FILTER);
    LinkedList<BlockMetaData> expectedRowGroups = new LinkedList<BlockMetaData>();
    expectedRowGroups.addAll(f1Footer.getBlocks());
    expectedRowGroups.addAll(f2Footer.getBlocks());
    Assert.assertEquals("Combined should have the right number of row groups", expectedRowGroups.size(), combinedFooter.getBlocks().size());
    // row groups in the combined file start right after the 4-byte "PAR1" magic
    long nextStart = 4;
    for (BlockMetaData rowGroup : combinedFooter.getBlocks()) {
        BlockMetaData expected = expectedRowGroups.removeFirst();
        Assert.assertEquals("Row count should match", expected.getRowCount(), rowGroup.getRowCount());
        Assert.assertEquals("Compressed size should match", expected.getCompressedSize(), rowGroup.getCompressedSize());
        Assert.assertEquals("Total size should match", expected.getTotalByteSize(), rowGroup.getTotalByteSize());
        Assert.assertEquals("Start pos should be at the last row group's end", nextStart, rowGroup.getStartingPos());
        assertColumnsEquivalent(expected.getColumns(), rowGroup.getColumns());
        nextStart = rowGroup.getStartingPos() + rowGroup.getTotalByteSize();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) LinkedList(java.util.LinkedList) Test(org.junit.Test)
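The static ParquetFileReader.readFooter overloads used above are deprecated in newer parquet-mr releases. A minimal sketch of the equivalent footer read through ParquetFileReader.open and HadoopInputFile (names as in recent parquet-mr versions; assumes imports of ParquetFileReader, ParquetMetadata, BlockMetaData, HadoopInputFile, Configuration and Path):

// Reads a footer without the deprecated readFooter helpers and prints each
// row group's starting position and total byte size.
static void printRowGroups(Configuration conf, Path file) throws IOException {
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(file, conf))) {
        ParquetMetadata footer = reader.getFooter();
        for (BlockMetaData rowGroup : footer.getBlocks()) {
            System.out.println(rowGroup.getStartingPos() + " / " + rowGroup.getTotalByteSize());
        }
    }
}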

Example 62 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache, in the class TestThriftToParquetFileWriter, method createRecordReader.

private ParquetReader<Group> createRecordReader(Path parquetFilePath) throws IOException {
    Configuration configuration = new Configuration(true);
    GroupReadSupport readSupport = new GroupReadSupport();
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, parquetFilePath);
    MessageType schema = readFooter.getFileMetaData().getSchema();
    readSupport.init(configuration, null, schema);
    return new ParquetReader<Group>(parquetFilePath, readSupport);
}
Also used : GroupReadSupport(org.apache.parquet.hadoop.example.GroupReadSupport) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ParquetReader(org.apache.parquet.hadoop.ParquetReader) MessageType(org.apache.parquet.schema.MessageType)
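A minimal usage sketch for the reader built above (a hypothetical helper, assuming org.apache.parquet.example.data.Group is imported): read every Group record and count them.

private long countRecords(Path parquetFilePath) throws IOException {
    long count = 0;
    try (ParquetReader<Group> reader = createRecordReader(parquetFilePath)) {
        Group record;
        // read() returns null once all records have been consumed
        while ((record = reader.read()) != null) {
            count++;
        }
    }
    return count;
}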

Example 63 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in the class TestParquetWriter, method runTestAndValidate.

public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile) throws Exception {
    try {
        deleteTableIfExists(outputFile);
        final String query = String.format("SELECT %s FROM %s", selection, inputTable);
        test("use dfs.tmp");
        test("CREATE TABLE %s AS %s", outputFile, query);
        testBuilder().unOrdered().sqlQuery(query).sqlBaselineQuery("SELECT %s FROM %s", validationSelection, outputFile).go();
        Configuration hadoopConf = new Configuration();
        hadoopConf.set(FileSystem.FS_DEFAULT_NAME_KEY, FileSystem.DEFAULT_FS);
        @SuppressWarnings("resource") Path output = new Path(dirTestWatcher.getDfsTestTmpDir().getAbsolutePath(), outputFile);
        FileSystem fs = output.getFileSystem(hadoopConf);
        for (FileStatus file : fs.listStatus(output)) {
            ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
            String version = footer.getFileMetaData().getKeyValueMetaData().get(DRILL_VERSION_PROPERTY);
            assertEquals(DrillVersionInfo.getVersion(), version);
        }
    } finally {
        deleteTableIfExists(outputFile);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FileSystem(org.apache.hadoop.fs.FileSystem)
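Reading only the file-level metadata is cheaper when row-group details are not needed. A minimal sketch along the lines of the loop above (assuming ParquetMetadataConverter.SKIP_ROW_GROUPS is statically imported, as in the test): look up a single key/value property from a file's footer.

static String readFooterProperty(Configuration conf, FileStatus file, String key) throws IOException {
    // SKIP_ROW_GROUPS deserializes only file-level metadata, skipping row-group details
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, SKIP_ROW_GROUPS);
    return footer.getFileMetaData().getKeyValueMetaData().get(key);
}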

Example 64 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in the class Metadata, method getParquetFileMetadata_v3.

/**
 * Get the metadata for a single file
 */
private ParquetFileMetadata_v3 getParquetFileMetadata_v3(ParquetTableMetadata_v3 parquetTableMetadata, final FileStatus file) throws IOException, InterruptedException {
    final ParquetMetadata metadata;
    final UserGroupInformation processUserUgi = ImpersonationUtil.getProcessUserUGI();
    try {
        metadata = processUserUgi.doAs(new PrivilegedExceptionAction<ParquetMetadata>() {

            public ParquetMetadata run() throws Exception {
                return ParquetFileReader.readFooter(fs.getConf(), file);
            }
        });
    } catch (Exception e) {
        logger.error("Exception while reading footer of parquet file [Details - path: {}, owner: {}] as process user {}", file.getPath(), file.getOwner(), processUserUgi.getShortUserName(), e);
        throw e;
    }
    MessageType schema = metadata.getFileMetaData().getSchema();
    // Map<SchemaPath, OriginalType> originalTypeMap = Maps.newHashMap();
    Map<SchemaPath, ColTypeInfo> colTypeInfoMap = Maps.newHashMap();
    schema.getPaths();
    for (String[] path : schema.getPaths()) {
        colTypeInfoMap.put(SchemaPath.getCompoundPath(path), getColTypeInfo(schema, schema, path, 0));
    }
    List<RowGroupMetadata_v3> rowGroupMetadataList = Lists.newArrayList();
    ArrayList<SchemaPath> ALL_COLS = new ArrayList<>();
    ALL_COLS.add(SchemaPath.STAR_COLUMN);
    boolean autoCorrectCorruptDates = formatConfig.areCorruptDatesAutoCorrected();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(metadata, ALL_COLS, autoCorrectCorruptDates);
    if (logger.isDebugEnabled()) {
        logger.debug(containsCorruptDates.toString());
    }
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<ColumnMetadata_v3> columnMetadataList = Lists.newArrayList();
        long length = 0;
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            ColumnMetadata_v3 columnMetadata;
            boolean statsAvailable = (col.getStatistics() != null && !col.getStatistics().isEmpty());
            Statistics<?> stats = col.getStatistics();
            String[] columnName = col.getPath().toArray();
            SchemaPath columnSchemaName = SchemaPath.getCompoundPath(columnName);
            ColTypeInfo colTypeInfo = colTypeInfoMap.get(columnSchemaName);
            ColumnTypeMetadata_v3 columnTypeMetadata = new ColumnTypeMetadata_v3(columnName, col.getType(), colTypeInfo.originalType, colTypeInfo.precision, colTypeInfo.scale, colTypeInfo.repetitionLevel, colTypeInfo.definitionLevel);
            if (parquetTableMetadata.columnTypeInfo == null) {
                parquetTableMetadata.columnTypeInfo = new ConcurrentHashMap<>();
            }
            // Save the column schema info. We'll merge it into one list
            parquetTableMetadata.columnTypeInfo.put(new ColumnTypeMetadata_v3.Key(columnTypeMetadata.name), columnTypeMetadata);
            if (statsAvailable) {
                // Write stats when they are not null
                Object minValue = null;
                Object maxValue = null;
                if (stats.genericGetMax() != null && stats.genericGetMin() != null) {
                    minValue = stats.genericGetMin();
                    maxValue = stats.genericGetMax();
                    if (containsCorruptDates == ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION && columnTypeMetadata.originalType == OriginalType.DATE) {
                        minValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) minValue);
                        maxValue = ParquetReaderUtility.autoCorrectCorruptedDate((Integer) maxValue);
                    }
                }
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), minValue, maxValue, stats.getNumNulls());
            } else {
                columnMetadata = new ColumnMetadata_v3(columnTypeMetadata.name, col.getType(), null, null, null);
            }
            columnMetadataList.add(columnMetadata);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        RowGroupMetadata_v3 rowGroupMeta = new RowGroupMetadata_v3(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(file, rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    String path = Path.getPathWithoutSchemeAndAuthority(file.getPath()).toString();
    return new ParquetFileMetadata_v3(path, file.getLen(), rowGroupMetadataList);
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ArrayList(java.util.ArrayList) SchemaPath(org.apache.drill.common.expression.SchemaPath) MessageType(org.apache.parquet.schema.MessageType) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException)
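The statistics handling above is tied to Drill's ColumnMetadata_v3 classes. A minimal standalone sketch of the same footer walk, printing per-column min/max/null-count only when statistics are present (assuming the imports listed above plus org.apache.parquet.column.statistics.Statistics):

static void printColumnStats(ParquetMetadata footer) {
    for (BlockMetaData rowGroup : footer.getBlocks()) {
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            Statistics<?> stats = col.getStatistics();
            // statistics may be absent or empty, e.g. for files written without them
            if (stats != null && !stats.isEmpty()) {
                System.out.println(col.getPath() + ": min=" + stats.genericGetMin()
                    + ", max=" + stats.genericGetMax() + ", nulls=" + stats.getNumNulls());
            }
        }
    }
}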

Example 65 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by axbaretto, in the class TestParquetFilterPushDown, method testDatePredicateAgainstDrillCTASPost1_8WithEval.

@Test
public void testDatePredicateAgainstDrillCTASPost1_8WithEval() throws Exception {
    // The parquet file is created on drill 1.9.0-SNAPSHOT (commit id:03e8f9f3e01c56a9411bb4333e4851c92db6e410) with DRILL CTAS:
    // create table dfs.tmp.`dateTbl1_9/t1` as select cast(o_orderdate as date) as o_orderdate from cp.`tpch/orders.parquet` where o_orderdate between date '1992-01-01' and date '1992-01-03';
    final File file = dirTestWatcher.getRootDir().toPath().resolve(Paths.get("parquetFilterPush", "dateTbl1_9", "t1", "0_0_0.parquet")).toFile();
    ParquetMetadata footer = getParquetMetaData(file);
    testDatePredicateAgainstDrillCTASHelper(footer);
}
Also used : ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) File(java.io.File) Test(org.junit.Test)
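getParquetMetaData is a helper defined elsewhere in the Drill test class. A hypothetical sketch of what such a helper might look like, reading the footer of a local file through the Hadoop Path API (the actual Drill implementation may differ):

private ParquetMetadata getParquetMetaData(File file) throws IOException {
    Configuration conf = new Configuration();
    // NO_FILTER reads the complete footer, including all row-group metadata
    return ParquetFileReader.readFooter(conf, new Path(file.toURI()), ParquetMetadataConverter.NO_FILTER);
}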

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76
Path (org.apache.hadoop.fs.Path): 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27
Configuration (org.apache.hadoop.conf.Configuration): 21
MessageType (org.apache.parquet.schema.MessageType): 21
ArrayList (java.util.ArrayList): 19
IOException (java.io.IOException): 18
Test (org.junit.Test): 17
FileSystem (org.apache.hadoop.fs.FileSystem): 16
Map (java.util.Map): 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11
File (java.io.File): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9
HashMap (java.util.HashMap): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
List (java.util.List): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6