Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.
From the class ParquetMetadataConverter, the method toParquetMetadata:
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  // Translate each block into a Thrift RowGroup and accumulate the total row count.
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion, toParquetSchema(parquetMetadata.getFileMetaData().getSchema()), numRows, rowGroups);
  // Copy the application key/value metadata into the footer.
  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }
  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
  return fileMetaData;
}
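A minimal usage sketch (not from the project) showing the method above in use: build an in-memory ParquetMetadata with an empty block list and convert it to the Thrift-generated footer model. The schema string and the "example-writer" value are illustrative.

import java.util.ArrayList;
import java.util.HashMap;
import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ToParquetMetadataExample {
  public static void main(String[] args) {
    MessageType schema = MessageTypeParser.parseMessageType("message example { required int32 id; }");
    org.apache.parquet.hadoop.metadata.FileMetaData hadoopMetaData =
        new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), "example-writer");
    ParquetMetadata parquetMetadata = new ParquetMetadata(hadoopMetaData, new ArrayList<BlockMetaData>());
    // Convert the Hadoop-side metadata model into the Thrift format model.
    FileMetaData formatMetaData = new ParquetMetadataConverter().toParquetMetadata(1, parquetMetadata);
    System.out.println(formatMetaData.getNum_rows()); // 0, since there are no row groups
  }
}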
Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.
From the class TestParquetMetadataConverter, the method testNullFieldMetadataDebugLogging:
@Test
public void testNullFieldMetadataDebugLogging() {
  MessageType schema = parseMessageType("message test { optional binary some_null_field; }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  blockMetaData.addColumn(createColumnChunkMetaData());
  blockMetaDataList.add(blockMetaData);
  ParquetMetadata metadata = new ParquetMetadata(fileMetaData, blockMetaDataList);
  ParquetMetadata.toJSON(metadata);
}
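The private createColumnChunkMetaData() helper is not shown above. A plausible shape for it, as a hedged sketch: it assumes the ColumnChunkMetaData.get factory overload that accepts a Statistics argument, with ColumnPath and CompressionCodecName from org.apache.parquet.hadoop.metadata, BinaryStatistics from org.apache.parquet.column.statistics, and Encoding from org.apache.parquet.column.

private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<Encoding> encodings = new HashSet<Encoding>();
  // Leave the statistics unpopulated on purpose: the test checks that
  // toJSON copes with metadata whose fields were never filled in.
  BinaryStatistics stats = new BinaryStatistics();
  return ColumnChunkMetaData.get(
      ColumnPath.get("some_null_field"), PrimitiveTypeName.BINARY,
      CompressionCodecName.UNCOMPRESSED, encodings, stats,
      0, 0, 0, 0, 0);
}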
Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.
From the class TestParquetMetadataConverter, the method testColumnOrders:
@Test
public void testColumnOrders() throws IOException {
  MessageType schema = parseMessageType("message test {"
      + "  optional binary binary_col;" // Normal column with type defined column order -> typeDefined
      + "  optional group map_col (MAP) {"
      + "    repeated group map (MAP_KEY_VALUE) {"
      + "      required binary key (UTF8);" // Key to be hacked to have unknown column order -> undefined
      + "      optional group list_col (LIST) {"
      + "        repeated group list {"
      + "          optional int96 array_element;" // INT96 element with type defined column order -> undefined
      + "        }"
      + "      }"
      + "    }"
      + "  }"
      + "}");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
  ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
  List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
  assertEquals(3, columnOrders.size());
  for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
    assertTrue(columnOrder.isSetTYPE_ORDER());
  }
  // Simulate that thrift got a union type that is not in the generated code
  // (when the file contains a not-yet-supported column order)
  columnOrders.get(1).clear();
  MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
  List<ColumnDescriptor> columns = resultSchema.getColumns();
  assertEquals(3, columns.size());
  assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
  assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
  assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
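The clear() trick works because org.apache.parquet.format.ColumnOrder is a Thrift union: clearing it leaves no member set, which is exactly what the generated code sees when a footer carries a union value it does not know. A small sketch (not from the project) of that behavior:

org.apache.parquet.format.ColumnOrder order = new org.apache.parquet.format.ColumnOrder();
order.setTYPE_ORDER(new org.apache.parquet.format.TypeDefinedOrder());
System.out.println(order.isSetTYPE_ORDER()); // true
order.clear(); // now behaves like a union member unknown to the generated code
System.out.println(order.isSetTYPE_ORDER()); // false -> the converter falls back to ColumnOrder.undefined()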
Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.
From the class TestParquetMetadataConverter, the method metadata:
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED, 10L, size * 2, size, offset));
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
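A hedged usage sketch for this helper (using assertEquals as the surrounding tests do): each argument becomes one row group, and the single column chunks are laid out back-to-back, so each file offset is the running sum of the preceding sizes.

FileMetaData md = metadata(100, 200, 300);
assertEquals(3, md.getRow_groups().size());
// Each row group's single column chunk starts where the previous one ended.
assertEquals(0, md.getRow_groups().get(0).getColumns().get(0).getFile_offset());
assertEquals(100, md.getRow_groups().get(1).getColumns().get(0).getFile_offset());
assertEquals(300, md.getRow_groups().get(2).getColumns().get(0).getFile_offset());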
Use of org.apache.parquet.format.FileMetaData in project parquet-mr by apache.
From the class ParquetMetadataConverter, the method readParquetMetadata:
public ParquetMetadata readParquetMetadata(final InputStream from, MetadataFilter filter) throws IOException {
  FileMetaData fileMetaData = filter.accept(new MetadataFilterVisitor<FileMetaData, IOException>() {

    @Override
    public FileMetaData visit(NoFilter filter) throws IOException {
      return readFileMetaData(from);
    }

    @Override
    public FileMetaData visit(SkipMetadataFilter filter) throws IOException {
      return readFileMetaData(from, true);
    }

    @Override
    public FileMetaData visit(OffsetMetadataFilter filter) throws IOException {
      return filterFileMetaDataByStart(readFileMetaData(from), filter);
    }

    @Override
    public FileMetaData visit(RangeMetadataFilter filter) throws IOException {
      return filterFileMetaDataByMidpoint(readFileMetaData(from), filter);
    }
  });
  LOG.debug("{}", fileMetaData);
  ParquetMetadata parquetMetadata = fromParquetMetadata(fileMetaData);
  if (LOG.isDebugEnabled()) {
    LOG.debug(ParquetMetadata.toPrettyJSON(parquetMetadata));
  }
  return parquetMetadata;
}
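A hedged round-trip sketch (not from the project; it assumes Util from org.apache.parquet.format and the public ParquetMetadataConverter.NO_FILTER constant): serialize a footer with Util.writeFileMetaData, then read it back through the method above.

// Inside a method that declares throws IOException.
MessageType schema = MessageTypeParser.parseMessageType("message example { required int32 id; }");
org.apache.parquet.hadoop.metadata.FileMetaData hadoopMd =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
ParquetMetadata original = new ParquetMetadata(hadoopMd, new ArrayList<BlockMetaData>());
ParquetMetadataConverter converter = new ParquetMetadataConverter();

// Write the Thrift footer to an in-memory buffer...
ByteArrayOutputStream out = new ByteArrayOutputStream();
Util.writeFileMetaData(converter.toParquetMetadata(1, original), out);

// ...and read it back; NO_FILTER keeps every row group.
ParquetMetadata roundTripped = converter.readParquetMetadata(
    new ByteArrayInputStream(out.toByteArray()), ParquetMetadataConverter.NO_FILTER);
System.out.println(roundTripped.getFileMetaData().getSchema());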