Example 36 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class ParquetFileWriter method writeMetadataFile.

/**
 * Writes the _common_metadata file, and optionally a _metadata file, depending on the {@link JobSummaryLevel} provided.
 * @param configuration the configuration to use to get the FileSystem
 * @param outputPath the directory to write the _metadata file to
 * @param footers the list of footers to merge
 * @param level the level of summary to write
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers, JobSummaryLevel level) throws IOException {
    Preconditions.checkArgument(level == JobSummaryLevel.ALL || level == JobSummaryLevel.COMMON_ONLY, "Unsupported level: " + level);
    FileSystem fs = outputPath.getFileSystem(configuration);
    outputPath = outputPath.makeQualified(fs);
    ParquetMetadata metadataFooter = mergeFooters(outputPath, footers);
    if (level == JobSummaryLevel.ALL) {
        writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_METADATA_FILE);
    }
    // the _common_metadata variant carries no row-group info, so drop the blocks
    metadataFooter.getBlocks().clear();
    writeMetadataFile(outputPath, metadataFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FileSystem (org.apache.hadoop.fs.FileSystem)
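
For context, a minimal caller sketch. The output directory (taken from args[0]) and the WriteSummaryFiles class name are hypothetical; writeMetadataFile, readFooters, and JobSummaryLevel are the parquet-mr APIs shown above, all deprecated together with the summary files themselves.

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel;

public class WriteSummaryFiles {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // hypothetical directory that already contains Parquet part files
        Path outputDir = new Path(args[0]);
        // readFooters is deprecated for the same reason as writeMetadataFile
        List<Footer> footers = ParquetFileReader.readFooters(conf, outputDir);
        // ALL writes _metadata (with row-group info) plus _common_metadata;
        // COMMON_ONLY would take the branch above that skips the _metadata file
        ParquetFileWriter.writeMetadataFile(conf, outputDir, footers, JobSummaryLevel.ALL);
    }
}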

Example 37 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class ParquetFileWriter method mergeMetadataFiles.

/**
 * Given a list of metadata files, merges them into a single ParquetMetadata.
 * Requires that the schemas be compatible and that the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @param keyValueMetadataMergeStrategy the strategy used to merge values that share the same key
 * @return the merged Parquet metadata for the files
 * @throws IOException if there is an error while reading the metadata files
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf, KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy) throws IOException {
    Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
    GlobalMetaData globalMetaData = null;
    List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
    for (Path p : files) {
        ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
        FileMetaData fmd = pmd.getFileMetaData();
        globalMetaData = mergeInto(fmd, globalMetaData, true);
        blocks.addAll(pmd.getBlocks());
    }
    // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
    return new ParquetMetadata(globalMetaData.merge(keyValueMetadataMergeStrategy), blocks);
}
Also used: Path (org.apache.hadoop.fs.Path), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ArrayList (java.util.ArrayList), GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData), Util.writeFileMetaData (org.apache.parquet.format.Util.writeFileMetaData), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
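
A hedged usage sketch: merging the _metadata files left behind by two earlier jobs. The file locations and the MergeSummaries class are hypothetical, and StrictKeyValueMetadataMergeStrategy is assumed to be the parquet-mr implementation of KeyValueMetadataMergeStrategy that rejects conflicting values for the same key (a concatenating variant is assumed to ship alongside it for callers that want to keep all values).

import java.io.IOException;
import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.metadata.StrictKeyValueMetadataMergeStrategy;

public class MergeSummaries {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // hypothetical _metadata files produced by two earlier jobs
        Path a = new Path("/data/job-a/_metadata");
        Path b = new Path("/data/job-b/_metadata");
        // the strict strategy fails if the same key carries different values
        ParquetMetadata merged = ParquetFileWriter.mergeMetadataFiles(
            Arrays.asList(a, b), conf, new StrictKeyValueMetadataMergeStrategy());
        System.out.println(merged.getFileMetaData().getSchema());
    }
}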

Example 38 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class ParquetFileWriter method getGlobalMetaData.

static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
    GlobalMetaData fileMetaData = null;
    // fold each footer's FileMetaData into a single GlobalMetaData; mergeInto
    // throws when strict is true and the file schemas conflict
    for (Footer footer : footers) {
        ParquetMetadata currentMetadata = footer.getParquetMetadata();
        fileMetaData = mergeInto(currentMetadata.getFileMetaData(), fileMetaData, strict);
    }
    return fileMetaData;
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), GlobalMetaData (org.apache.parquet.hadoop.metadata.GlobalMetaData)
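
getGlobalMetaData and mergeInto are package-private, so code outside org.apache.parquet.hadoop cannot call them directly. Below is a sketch of the same fold over footers using only public API; the SchemaUnion class and unionSchemas helper are hypothetical, while MessageType.union is the real parquet-mr schema-merge entry point.

import java.util.List;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.schema.MessageType;

public class SchemaUnion {
    // same accumulation pattern as getGlobalMetaData, limited to the schema
    static MessageType unionSchemas(List<Footer> footers, boolean strict) {
        MessageType merged = null;
        for (Footer footer : footers) {
            MessageType schema = footer.getParquetMetadata().getFileMetaData().getSchema();
            // union merges field by field and throws on conflicting types
            // when strict is true
            merged = (merged == null) ? schema : merged.union(schema, strict);
        }
        return merged;
    }
}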

Example 39 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class ParquetRecordReaderTest method validateFooters.

private void validateFooters(final List<Footer> metadata) {
    logger.debug(metadata.toString());
    assertEquals(3, metadata.size());
    for (Footer footer : metadata) {
        final File file = new File(footer.getFile().toUri());
        assertTrue(file.getName(), file.getName().startsWith("part"));
        assertTrue(file.getPath(), file.exists());
        final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
        assertEquals(2, parquetMetadata.getBlocks().size());
        final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
        assertEquals("bar", keyValueMetaData.get("foo"));
        assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
    }
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), Footer (org.apache.parquet.hadoop.Footer), File (java.io.File)

Example 40 with ParquetMetadata

use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.

the class TestParquetWriter method runTestAndValidate.

public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile) throws Exception {
    try {
        run("drop table if exists dfs.tmp.%s", outputFile);
        final String query = String.format("SELECT %s FROM %s", selection, inputTable);
        run("use dfs.tmp");
        run("CREATE TABLE %s AS %s", outputFile, query);
        testBuilder().unOrdered().sqlQuery(query).sqlBaselineQuery("SELECT %s FROM %s", validationSelection, outputFile).go();
        Configuration hadoopConf = new Configuration();
        hadoopConf.set(FileSystem.FS_DEFAULT_NAME_KEY, FileSystem.DEFAULT_FS);
        Path output = new Path(dirTestWatcher.getDfsTestTmpDir().getAbsolutePath(), outputFile);
        FileSystem fs = output.getFileSystem(hadoopConf);
        for (FileStatus file : fs.listStatus(output)) {
            // SKIP_ROW_GROUPS reads only the file-level metadata, which is all
            // the version check below needs
            ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, file, SKIP_ROW_GROUPS);
            String version = footer.getFileMetaData().getKeyValueMetaData().get(DRILL_VERSION_PROPERTY);
            assertEquals(DrillVersionInfo.getVersion(), version);
        }
    } finally {
        run("drop table if exists dfs.tmp.%s", outputFile);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FileSystem (org.apache.hadoop.fs.FileSystem)
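
readFooter(Configuration, FileStatus, MetadataFilter) is deprecated along with the other footer helpers above. Here is a sketch of the same version check through the non-deprecated ParquetFileReader.open path; the input path in args[0] and the PrintDrillVersion class are hypothetical, and note that this variant reads the full footer instead of skipping row groups.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class PrintDrillVersion {
    public static void main(String[] args) throws IOException {
        HadoopInputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
        try (ParquetFileReader reader = ParquetFileReader.open(in)) {
            ParquetMetadata footer = reader.getFooter();
            // "drill.version" is assumed to be the key behind Drill's
            // DRILL_VERSION_PROPERTY constant
            System.out.println(footer.getFileMetaData().getKeyValueMetaData().get("drill.version"));
        }
    }
}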

Aggregations

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76
Path (org.apache.hadoop.fs.Path): 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27
Configuration (org.apache.hadoop.conf.Configuration): 21
MessageType (org.apache.parquet.schema.MessageType): 21
ArrayList (java.util.ArrayList): 19
IOException (java.io.IOException): 18
Test (org.junit.Test): 17
FileSystem (org.apache.hadoop.fs.FileSystem): 16
Map (java.util.Map): 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11
File (java.io.File): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9
HashMap (java.util.HashMap): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
List (java.util.List): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6