Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class PrintFooter, method main.
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage PrintFooter <path>");
        return;
    }
    Path path = new Path(new URI(args[0]));
    final Configuration configuration = new Configuration();
    final FileSystem fs = path.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(path);
    Path summary = new Path(fileStatus.getPath(), PARQUET_METADATA_FILE);
    if (fileStatus.isDir() && fs.exists(summary)) {
        System.out.println("reading summary file");
        FileStatus summaryStatus = fs.getFileStatus(summary);
        List<Footer> readSummaryFile = ParquetFileReader.readSummaryFile(configuration, summaryStatus);
        for (Footer footer : readSummaryFile) {
            add(footer.getParquetMetadata());
        }
    } else {
        List<FileStatus> statuses;
        if (fileStatus.isDir()) {
            System.out.println("listing files in " + fileStatus.getPath());
            statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
        } else {
            statuses = new ArrayList<FileStatus>();
            statuses.add(fileStatus);
        }
        System.out.println("opening " + statuses.size() + " files");
        int i = 0;
        ExecutorService threadPool = Executors.newFixedThreadPool(5);
        try {
            long t0 = System.currentTimeMillis();
            Deque<Future<ParquetMetadata>> footers = new LinkedBlockingDeque<Future<ParquetMetadata>>();
            for (final FileStatus currentFile : statuses) {
                footers.add(threadPool.submit(new Callable<ParquetMetadata>() {

                    @Override
                    public ParquetMetadata call() throws Exception {
                        try {
                            ParquetMetadata footer = ParquetFileReader.readFooter(configuration, currentFile, NO_FILTER);
                            return footer;
                        } catch (Exception e) {
                            throw new ParquetDecodingException("could not read footer", e);
                        }
                    }
                }));
            }
            int previousPercent = 0;
            int n = 60;
            System.out.print("0% [");
            for (int j = 0; j < n; j++) {
                System.out.print(" ");
            }
            System.out.print("] 100%");
            for (int j = 0; j < n + 6; j++) {
                System.out.print('\b');
            }
            while (!footers.isEmpty()) {
                Future<ParquetMetadata> futureFooter = footers.removeFirst();
                if (!futureFooter.isDone()) {
                    footers.addLast(futureFooter);
                    continue;
                }
                ParquetMetadata footer = futureFooter.get();
                int currentPercent = (++i * n / statuses.size());
                while (currentPercent > previousPercent) {
                    System.out.print("*");
                    previousPercent++;
                }
                add(footer);
            }
            System.out.println("");
            long t1 = System.currentTimeMillis();
            System.out.println("read all footers in " + (t1 - t0) + " ms");
        } finally {
            threadPool.shutdownNow();
        }
    }
    Set<Entry<ColumnDescriptor, ColStats>> entries = stats.entrySet();
    long total = 0;
    long totalUnc = 0;
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        total += colStats.allStats.total;
        totalUnc += colStats.uncStats.total;
    }
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        System.out.println(entry.getKey() + " " + percent(colStats.allStats.total, total) + "% of all space " + colStats);
    }
    System.out.println("number of blocks: " + blockCount);
    System.out.println("total data size: " + humanReadable(total) + " (raw " + humanReadable(totalUnc) + ")");
    System.out.println("total record: " + humanReadable(recordCount));
    System.out.println("average block size: " + humanReadable(total / blockCount) + " (raw " + humanReadable(totalUnc / blockCount) + ")");
    System.out.println("average record count: " + humanReadable(recordCount / blockCount));
}
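For orientation, a minimal standalone sketch of the per-file footer read that PrintFooter submits to its thread pool, using the same parquet-mr API (ParquetFileReader.readFooter with NO_FILTER, deprecated in recent releases but still present); the input path and class name are hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

public class SingleFooterDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical path; replace with a real Parquet file
        Path path = new Path("/tmp/example.parquet");
        Configuration conf = new Configuration();
        FileStatus status = path.getFileSystem(conf).getFileStatus(path);
        // Same call PrintFooter makes per file, minus the parallelism
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, status, NO_FILTER);
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println(block.getRowCount() + " rows, "
                + block.getCompressedSize() + " bytes compressed");
        }
    }
}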
Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.
The class TestParquetMetadataConverter, method testColumnOrders.
@Test
public void testColumnOrders() throws IOException {
    MessageType schema = parseMessageType("message test {"
        // Normal column with type defined column order -> typeDefined
        + "  optional binary binary_col;"
        + "  optional group map_col (MAP) {"
        + "    repeated group map (MAP_KEY_VALUE) {"
        // Key to be hacked to have unknown column order -> undefined
        + "      required binary key (UTF8);"
        + "      optional group list_col (LIST) {"
        + "        repeated group list {"
        // INT96 element with type defined column order -> undefined
        + "          optional int96 array_element;"
        + "        }"
        + "      }"
        + "    }"
        + "  }"
        + "}");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
    List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
    assertEquals(3, columnOrders.size());
    for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
        assertTrue(columnOrder.isSetTYPE_ORDER());
    }
    // Simulate that thrift got a union type that is not in the generated code
    // (when the file contains a not-yet-supported column order)
    columnOrders.get(1).clear();
    MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
    List<ColumnDescriptor> columns = resultSchema.getColumns();
    assertEquals(3, columns.size());
    assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
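As a quick contrast to the cleared-union fallback exercised above, a hedged sketch showing that a freshly parsed schema defaults every primitive to the type-defined order without any metadata round-trip; the schema text here is made up.

import org.apache.parquet.schema.ColumnOrder;
import org.apache.parquet.schema.MessageType;
import static org.apache.parquet.schema.MessageTypeParser.parseMessageType;

// Without round-tripping through format metadata, columnOrder() is the default
MessageType schema = parseMessageType("message m { required binary b (UTF8); }");
ColumnOrder order = schema.getType("b").asPrimitiveType().columnOrder();
System.out.println(order.getColumnOrderName()); // TYPE_DEFINED_ORDER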
Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.
The class ParquetReaderUtility, method containsComplexColumn.
/**
 * Checks whether any of the columns in the given list is either nested or repeated.
 *
 * @param footer  Parquet file metadata containing the schema
 * @param columns list of query SchemaPath objects
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
    MessageType schema = footer.getFileMetaData().getSchema();
    if (Utilities.isStarQuery(columns)) {
        for (Type type : schema.getFields()) {
            if (!type.isPrimitive()) {
                return true;
            }
        }
        for (ColumnDescriptor col : schema.getColumns()) {
            if (col.getMaxRepetitionLevel() > 0) {
                return true;
            }
        }
        return false;
    } else {
        Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
        Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
        for (SchemaPath schemaPath : columns) {
            // A non-leaf schema path is a complex column
            if (!schemaPath.isLeaf()) {
                logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
                return true;
            }
            // A column descriptor lookup failure may mean two things, depending on the subsequent SchemaElement lookup:
            // 1. success: the queried column is complex, i.e. a GroupType
            // 2. failure: the queried column is not in the schema and thus is non-complex
            ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());
            if (column == null) {
                SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
                if (schemaElement != null) {
                    return true;
                }
            } else {
                if (column.getMaxRepetitionLevel() > 0) {
                    logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
                    return true;
                }
            }
        }
    }
    return false;
}
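A hedged usage sketch of the helper above; SchemaPath.getSimplePath is Drill's real factory method, while the footer variable and column names are assumed for illustration.

// Assumes 'footer' was obtained elsewhere, e.g. via ParquetFileReader
List<SchemaPath> projected = Arrays.asList(
    SchemaPath.getSimplePath("id"),
    SchemaPath.getSimplePath("name"));
if (ParquetReaderUtility.containsComplexColumn(footer, projected)) {
    // fall back to Drill's complex Parquet reader
}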
Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.
The class ReadState, method buildReader.
/**
 * Creates the readers needed to read columns, either fixed-length or variable-length.
 *
 * @param reader parquet record reader
 * @param output output mutator
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
    if (totalNumRecordsToRead == 0) {
        // No need to spend resources initializing readers when only the schema will be output
        for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
            columnMetadata.buildVector(output);
        }
    } else {
        List<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
        // Initialize all of the column read status objects
        BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
        if (rowGroupMetadata != null) {
            Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
            for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
                ColumnDescriptor column = columnMetadata.column;
                columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
                columnMetadata.buildVector(output);
                if (!columnMetadata.isFixedLength()) {
                    // Create a reader and add it to the appropriate list
                    varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
                } else if (columnMetadata.isRepeated()) {
                    varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
                } else {
                    fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
                }
            }
            varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
        }
    }
    if (!schema.isStarQuery()) {
        schema.createNonExistentColumns(output, nullFilledVectors);
    }
}
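The three-way split above hinges on ParquetColumnMetadata predicates; below is a minimal sketch of an analogous fixed-width test expressed directly against ColumnDescriptor. The method name is hypothetical, and Drill's actual isFixedLength logic lives in ParquetColumnMetadata and may differ.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;

// BINARY is the only primitive type name with no fixed byte width;
// hypothetical helper approximating a fixed-length check
static boolean isFixedWidth(ColumnDescriptor column) {
    return column.getPrimitiveType().getPrimitiveTypeName() != PrimitiveTypeName.BINARY;
}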
Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.
The class FileMetadataCollector, method init.
private void init() throws IOException {
    long totalRowCount = 0;
    List<Metadata_V4.RowGroupMetadata_v4> rowGroupMetadataList = new ArrayList<>();
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<Metadata_V4.ColumnMetadata_v4> columnMetadataList = new ArrayList<>();
        long length = 0;
        totalRowCount = totalRowCount + rowGroup.getRowCount();
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            String[] columnName = col.getPath().toArray();
            Statistics<?> stats = col.getStatistics();
            PrimitiveType.PrimitiveTypeName primitiveTypeName = col.getPrimitiveType().getPrimitiveTypeName();
            addColumnMetadata(columnName, stats, primitiveTypeName, columnMetadataList);
            length += col.getTotalSize();
        }
        // Note: we still read the schema even if there are no values in the row group
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        Metadata_V4.RowGroupMetadata_v4 rowGroupMeta = new Metadata_V4.RowGroupMetadata_v4(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    // Add a fake row group based on the file schema when the file is empty or all row groups are empty
    if (rowGroupMetadataList.isEmpty()) {
        List<Metadata_V4.ColumnMetadata_v4> columnMetadataList = new ArrayList<>();
        for (ColumnDescriptor columnDescriptor : schema.getColumns()) {
            Statistics<?> stats = Statistics.getBuilderForReading(columnDescriptor.getPrimitiveType()).withMax(null).withMin(null).withNumNulls(0).build();
            addColumnMetadata(columnDescriptor.getPath(), stats, columnDescriptor.getPrimitiveType().getPrimitiveTypeName(), columnMetadataList);
        }
        Metadata_V4.RowGroupMetadata_v4 rowGroupMeta = new Metadata_V4.RowGroupMetadata_v4(0L, 0L, 0L, getHostAffinity(0, 0L), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    Path path = Path.getPathWithoutSchemeAndAuthority(file.getPath());
    Metadata_V4.ParquetFileMetadata_v4 parquetFileMetadata_v4 = new Metadata_V4.ParquetFileMetadata_v4(path, file.getLen(), rowGroupMetadataList);
    this.fileMetadata = new Metadata_V4.ParquetFileAndRowCountMetadata(parquetFileMetadata_v4, totalNullCountMap, totalRowCount);
}
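The all-null statistics for the fake row group come from parquet-column's reading builder, isolated below as a sketch; columnDescriptor is assumed to be in scope, as in the loop above.

import org.apache.parquet.column.statistics.Statistics;

// Empty statistics for a column with no data pages: no min/max, zero nulls
Statistics<?> empty = Statistics
    .getBuilderForReading(columnDescriptor.getPrimitiveType())
    .withMax(null)
    .withMin(null)
    .withNumNulls(0)
    .build();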