Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
From the class ParquetFileWriter, method writeMetadataFile:
/**
 * Writes the _common_metadata summary file and, when the given {@link JobSummaryLevel}
 * is {@code ALL}, a full _metadata summary file as well.
 * @param configuration the configuration used to resolve the FileSystem
 * @param outputPath the directory the summary file(s) are written to
 * @param footers the footers to merge into the summary
 * @param level which summary files to produce; must be ALL or COMMON_ONLY
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static void writeMetadataFile(Configuration configuration, Path outputPath, List<Footer> footers, JobSummaryLevel level) throws IOException {
  Preconditions.checkArgument(level == JobSummaryLevel.ALL || level == JobSummaryLevel.COMMON_ONLY, "Unsupported level: " + level);
  FileSystem fs = outputPath.getFileSystem(configuration);
  Path qualifiedPath = outputPath.makeQualified(fs);
  ParquetMetadata mergedFooter = mergeFooters(qualifiedPath, footers);
  if (level == JobSummaryLevel.ALL) {
    // the full summary retains the per-row-group block metadata
    writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_METADATA_FILE);
  }
  // the common summary carries only file-level metadata, so drop the blocks first
  mergedFooter.getBlocks().clear();
  writeMetadataFile(qualifiedPath, mergedFooter, fs, PARQUET_COMMON_METADATA_FILE);
}
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
From the class ParquetFileWriter, method mergeMetadataFiles:
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata.
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a non-empty list of files to merge metadata from
 * @param conf a configuration used to read each footer
 * @param keyValueMetadataMergeStrategy strategy to merge values for same key, if there are multiple
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while reading a footer
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf, KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");
  GlobalMetaData globalMetaData = null;
  // diamond operator: element type is inferred from the declaration
  List<BlockMetaData> blocks = new ArrayList<>();
  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    // strict merge (third arg true): incompatible schemas / extra metadata will throw
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }
  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(keyValueMetadataMergeStrategy), blocks);
}
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
From the class ParquetFileWriter, method getGlobalMetaData:
/**
 * Folds the FileMetaData of every footer into a single GlobalMetaData.
 * Returns null when the footer list is empty.
 * @param footers the footers whose file metadata is merged
 * @param strict whether the merge should reject incompatible metadata
 * @return the merged GlobalMetaData, or null for an empty input
 */
static GlobalMetaData getGlobalMetaData(List<Footer> footers, boolean strict) {
  GlobalMetaData merged = null;
  for (Footer footer : footers) {
    merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, strict);
  }
  return merged;
}
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
From the class ParquetRecordReaderTest, method validateFooters:
/**
 * Checks the three expected part-file footers: each file exists, is named
 * "part*", holds two row groups, and carries the expected key-value metadata.
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());
  for (final Footer footer : metadata) {
    final File partFile = new File(footer.getFile().toUri());
    // file name doubles as the assertion message on failure
    assertTrue(partFile.getName(), partFile.getName().startsWith("part"));
    assertTrue(partFile.getPath(), partFile.exists());
    final ParquetMetadata footerMetadata = footer.getParquetMetadata();
    assertEquals(2, footerMetadata.getBlocks().size());
    final Map<String, String> keyValues = footerMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", keyValues.get("foo"));
    // each file also records its own name as a key
    assertEquals(footer.getFile().getName(), keyValues.get(footer.getFile().getName()));
  }
}
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project drill by apache.
From the class TestParquetWriter, method runTestAndValidate:
/**
 * Runs a CTAS round-trip: creates a parquet table from the input query,
 * validates its contents against the source, and verifies every output
 * file's footer records the current Drill version. The table is dropped
 * afterwards regardless of outcome.
 */
public void runTestAndValidate(String selection, String validationSelection, String inputTable, String outputFile) throws Exception {
  try {
    run("drop table if exists dfs.tmp.%s", outputFile);
    final String query = String.format("SELECT %s FROM %s", selection, inputTable);
    run("use dfs.tmp");
    run("CREATE TABLE %s AS %s", outputFile, query);
    // the written table must round-trip the original query results
    testBuilder().unOrdered().sqlQuery(query).sqlBaselineQuery("SELECT %s FROM %s", validationSelection, outputFile).go();
    final Configuration hadoopConf = new Configuration();
    hadoopConf.set(FileSystem.FS_DEFAULT_NAME_KEY, FileSystem.DEFAULT_FS);
    final Path tableDir = new Path(dirTestWatcher.getDfsTestTmpDir().getAbsolutePath(), outputFile);
    final FileSystem fs = tableDir.getFileSystem(hadoopConf);
    for (FileStatus status : fs.listStatus(tableDir)) {
      // row groups are irrelevant here; only the key-value metadata is inspected
      final ParquetMetadata footer = ParquetFileReader.readFooter(hadoopConf, status, SKIP_ROW_GROUPS);
      final String writtenVersion = footer.getFileMetaData().getKeyValueMetaData().get(DRILL_VERSION_PROPERTY);
      assertEquals(DrillVersionInfo.getVersion(), writtenVersion);
    }
  } finally {
    run("drop table if exists dfs.tmp.%s", outputFile);
  }
}
Aggregations