Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class ClientSideMetadataSplitStrategy, method getSplits.
List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers, long maxSplitSize, long minSplitSize, ReadContext readContext) throws IOException {
  List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
  Filter filter = ParquetInputFormat.getFilter(configuration);
  long rowGroupsDropped = 0;
  long totalRowGroups = 0;
  for (Footer footer : footers) {
    final Path file = footer.getFile();
    LOG.debug("{}", file);
    FileSystem fs = file.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(file);
    ParquetMetadata parquetMetaData = footer.getParquetMetadata();
    List<BlockMetaData> blocks = parquetMetaData.getBlocks();
    List<BlockMetaData> filteredBlocks;
    totalRowGroups += blocks.size();
    filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
    rowGroupsDropped += blocks.size() - filteredBlocks.size();
    if (filteredBlocks.isEmpty()) {
      continue;
    }
    BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
    splits.addAll(generateSplits(filteredBlocks, fileBlockLocations, fileStatus, readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(), minSplitSize, maxSplitSize));
  }
  if (rowGroupsDropped > 0 && totalRowGroups > 0) {
    int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
    LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
  } else {
    LOG.info("There were no row groups that could be dropped due to filter predicates");
  }
  return splits;
}
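The filter consumed by ParquetInputFormat.getFilter(configuration) above must be set on the job configuration beforehand. Below is a minimal, hedged sketch of that setup, assuming a hypothetical column named "id" and an illustrative value; it is not code from parquet-mr:

import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class FilterSetupSketch {
  public static Configuration newConfigurationWithFilter() {
    Configuration conf = new Configuration();
    // Hypothetical predicate: keep only row groups whose statistics may contain id == 42.
    FilterPredicate predicate = FilterApi.eq(FilterApi.longColumn("id"), 42L);
    // getSplits above retrieves this via ParquetInputFormat.getFilter(configuration)
    // and hands it to RowGroupFilter.filterRowGroups.
    ParquetInputFormat.setFilterPredicate(conf, predicate);
    return conf;
  }
}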
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class PrintFooter, method main.
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    System.err.println("usage PrintFooter <path>");
    return;
  }
  Path path = new Path(new URI(args[0]));
  final Configuration configuration = new Configuration();
  final FileSystem fs = path.getFileSystem(configuration);
  FileStatus fileStatus = fs.getFileStatus(path);
  Path summary = new Path(fileStatus.getPath(), PARQUET_METADATA_FILE);
  if (fileStatus.isDir() && fs.exists(summary)) {
    System.out.println("reading summary file");
    FileStatus summaryStatus = fs.getFileStatus(summary);
    List<Footer> readSummaryFile = ParquetFileReader.readSummaryFile(configuration, summaryStatus);
    for (Footer footer : readSummaryFile) {
      add(footer.getParquetMetadata());
    }
  } else {
    List<FileStatus> statuses;
    if (fileStatus.isDir()) {
      System.out.println("listing files in " + fileStatus.getPath());
      statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
    } else {
      statuses = new ArrayList<FileStatus>();
      statuses.add(fileStatus);
    }
    System.out.println("opening " + statuses.size() + " files");
    int i = 0;
    ExecutorService threadPool = Executors.newFixedThreadPool(5);
    try {
      long t0 = System.currentTimeMillis();
      Deque<Future<ParquetMetadata>> footers = new LinkedBlockingDeque<Future<ParquetMetadata>>();
      for (final FileStatus currentFile : statuses) {
        footers.add(threadPool.submit(new Callable<ParquetMetadata>() {
          @Override
          public ParquetMetadata call() throws Exception {
            try {
              ParquetMetadata footer = ParquetFileReader.readFooter(configuration, currentFile, NO_FILTER);
              return footer;
            } catch (Exception e) {
              throw new ParquetDecodingException("could not read footer", e);
            }
          }
        }));
      }
      int previousPercent = 0;
      int n = 60;
      System.out.print("0% [");
      for (int j = 0; j < n; j++) {
        System.out.print(" ");
      }
      System.out.print("] 100%");
      for (int j = 0; j < n + 6; j++) {
        System.out.print('\b');
      }
      while (!footers.isEmpty()) {
        Future<ParquetMetadata> futureFooter = footers.removeFirst();
        if (!futureFooter.isDone()) {
          footers.addLast(futureFooter);
          continue;
        }
        ParquetMetadata footer = futureFooter.get();
        int currentPercent = (++i * n / statuses.size());
        while (currentPercent > previousPercent) {
          System.out.print("*");
          previousPercent++;
        }
        add(footer);
      }
      System.out.println("");
      long t1 = System.currentTimeMillis();
      System.out.println("read all footers in " + (t1 - t0) + " ms");
    } finally {
      threadPool.shutdownNow();
    }
  }
  Set<Entry<ColumnDescriptor, ColStats>> entries = stats.entrySet();
  long total = 0;
  long totalUnc = 0;
  for (Entry<ColumnDescriptor, ColStats> entry : entries) {
    ColStats colStats = entry.getValue();
    total += colStats.allStats.total;
    totalUnc += colStats.uncStats.total;
  }
  for (Entry<ColumnDescriptor, ColStats> entry : entries) {
    ColStats colStats = entry.getValue();
    System.out.println(entry.getKey() + " " + percent(colStats.allStats.total, total) + "% of all space " + colStats);
  }
  System.out.println("number of blocks: " + blockCount);
  System.out.println("total data size: " + humanReadable(total) + " (raw " + humanReadable(totalUnc) + ")");
  System.out.println("total record: " + humanReadable(recordCount));
  System.out.println("average block size: " + humanReadable(total / blockCount) + " (raw " + humanReadable(totalUnc / blockCount) + ")");
  System.out.println("average record count: " + humanReadable(recordCount / blockCount));
}
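The add(footer) helper that accumulates the per-column statistics is not shown above. As a rough, self-contained sketch (not the actual PrintFooter code), this is how a single footer can be read with the same readFooter call and summarized per row group; the input path is whatever file the caller supplies:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

public class FooterSummarySketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // NO_FILTER keeps all row-group metadata, so block-level counts and sizes are available.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(args[0]), NO_FILTER);
    long records = 0;
    long totalByteSize = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      records += block.getRowCount();
      totalByteSize += block.getTotalByteSize();
    }
    System.out.println("blocks: " + footer.getBlocks().size()
        + ", records: " + records
        + ", total (uncompressed) byte size: " + totalByteSize);
  }
}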
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestParquetMetadataConverter, method testColumnOrders.
@Test
public void testColumnOrders() throws IOException {
  MessageType schema = parseMessageType("message test {"
      // Normal column with type defined column order -> typeDefined
      + " optional binary binary_col;"
      + " optional group map_col (MAP) {"
      + " repeated group map (MAP_KEY_VALUE) {"
      // Key to be hacked to have unknown column order -> undefined
      + " required binary key (UTF8);"
      + " optional group list_col (LIST) {"
      + " repeated group list {"
      // INT96 element with type defined column order -> undefined
      + " optional int96 array_element;"
      + " }" + " }" + " }" + " }" + "}");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
  ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
  List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
  assertEquals(3, columnOrders.size());
  for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
    assertTrue(columnOrder.isSetTYPE_ORDER());
  }
  // Simulate that thrift got a union type that is not in the generated code
  // (when the file contains a not-yet-supported column order)
  columnOrders.get(1).clear();
  MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
  List<ColumnDescriptor> columns = resultSchema.getColumns();
  assertEquals(3, columns.size());
  assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
  assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
  assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
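As a small, hedged companion to the assertions above, the column order of any primitive column can be inspected directly from a parsed schema; the schema string here is made up for illustration:

import java.util.List;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnOrderSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message example { required binary name (UTF8); optional int96 ts; }");
    List<ColumnDescriptor> columns = schema.getColumns();
    for (ColumnDescriptor column : columns) {
      // Prints whatever column order is recorded on each primitive type.
      System.out.println(column + " -> " + column.getPrimitiveType().columnOrder());
    }
  }
}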
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class TestParquetMetadataConverter, method testMetadataToJson.
@Test
public void testMetadataToJson() {
  ParquetMetadata metadata = new ParquetMetadata(null, null);
  assertEquals("{\"fileMetaData\":null,\"blocks\":null}", ParquetMetadata.toJSON(metadata));
  assertEquals("{\n"
      + "  \"fileMetaData\" : null,\n"
      + "  \"blocks\" : null\n"
      + "}", ParquetMetadata.toPrettyJSON(metadata));
}
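For a less contrived round trip than the nulls used in the test, here is a hedged sketch of serializing an in-memory ParquetMetadata built from a parsed schema; the schema string and the "example-created-by" marker are hypothetical:

import java.util.ArrayList;
import java.util.HashMap;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MetadataJsonSketch {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType("message example { required int64 id; }");
    // Same constructors as in testColumnOrders above, with an empty block list.
    FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "example-created-by");
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
    System.out.println(ParquetMetadata.toJSON(metadata));
    System.out.println(ParquetMetadata.toPrettyJSON(metadata));
  }
}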
Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.
The class ParquetRecordReaderWrapper, method getSplit.
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
    final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
    return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
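A hedged sketch of the footer access pattern used above, in isolation: reading only file-level metadata with SKIP_ROW_GROUPS and inspecting the schema and key/value metadata; the input path is whatever file the caller supplies:

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.SKIP_ROW_GROUPS;

public class FileLevelMetadataSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // SKIP_ROW_GROUPS skips per-row-group metadata, which getSplit above does not need.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, new Path(args[0]), SKIP_ROW_GROUPS);
    FileMetaData fileMetaData = footer.getFileMetaData();
    System.out.println("created by: " + fileMetaData.getCreatedBy());
    System.out.println("schema: " + fileMetaData.getSchema());
    for (Map.Entry<String, String> entry : fileMetaData.getKeyValueMetaData().entrySet()) {
      System.out.println(entry.getKey() + " = " + entry.getValue());
    }
  }
}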