Use of org.apache.parquet.hadoop.ParquetFileReader in project hive by apache.
The class VectorizedParquetRecordReader, method initialize.
public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
  jobConf = configuration;
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  if (rowGroupOffsets == null) {
    //TODO check whether rowGroupOffSets can be null
    // then we need to apply the predicate push down filter
    footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readFooter(configuration, file, NO_FILTER);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException("All the offsets listed in the split should be found in the file."
          + " expected: " + Arrays.toString(rowGroupOffsets)
          + " found: " + blocks
          + " out of: " + Arrays.toString(foundRowGroupOffsets)
          + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  MessageType tableSchema;
  if (indexAccess) {
    List<Integer> indexSequence = new ArrayList<>();
    // Generates a sequence list of indexes
    for (int i = 0; i < columnNamesList.size(); i++) {
      indexSequence.add(i);
    }
    tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
  } else {
    tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
  }
  indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
  if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
    requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
  } else {
    requestedSchema = fileSchema;
  }
  this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
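For context outside of Hive, here is a minimal, self-contained sketch (not taken from either project) of the same footer-reading pattern: it reads the footer with ParquetFileReader.readFooter and NO_FILTER, prints the file schema, and sums row counts across row groups. The class name FooterSummary and the path /tmp/example.parquet are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class FooterSummary {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // placeholder path; point this at a real Parquet file
    Path file = new Path("/tmp/example.parquet");
    // NO_FILTER returns metadata for every row group in the file
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println("File schema:\n" + schema);
    long totalRowCount = 0;
    for (BlockMetaData block : footer.getBlocks()) {
      // each block is one row group; getStartingPos() is the offset matched against the split above
      totalRowCount += block.getRowCount();
    }
    System.out.println("Row groups: " + footer.getBlocks().size() + ", total rows: " + totalRowCount);
  }
}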
Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
The class TestArrayCompatibility, method testListOfSingleElementStructsWithElementField.
@Test
public void testListOfSingleElementStructsWithElementField() throws Exception {
  Path test = writeDirect("message ListOfSingleElementStructsWithElementField {"
      + " optional group list_of_structs (LIST) {"
      + " repeated group list {"
      + " required group element {"
      + " required float element;"
      + " }"
      + " }"
      + " }"
      + "}",
      new DirectWriter() {
        @Override
        public void write(RecordConsumer rc) {
          rc.startMessage();
          rc.startField("list_of_structs", 0);
          rc.startGroup();
          // start writing array contents
          rc.startField("list", 0);
          // write a non-null element
          // array level
          rc.startGroup();
          rc.startField("element", 0);
          // the inner element field
          rc.startGroup();
          rc.startField("element", 0);
          rc.addFloat(33.0F);
          rc.endField("element", 0);
          rc.endGroup();
          rc.endField("element", 0);
          // array level
          rc.endGroup();
          // write a second non-null element
          // array level
          rc.startGroup();
          rc.startField("element", 0);
          // the inner element field
          rc.startGroup();
          rc.startField("element", 0);
          rc.addFloat(34.0F);
          rc.endField("element", 0);
          rc.endGroup();
          rc.endField("element", 0);
          // array level
          rc.endGroup();
          // finished writing array contents
          rc.endField("list", 0);
          rc.endGroup();
          rc.endField("list_of_structs", 0);
          rc.endMessage();
        }
      });
  Schema structWithElementField = record("element", field("element", primitive(Schema.Type.FLOAT)));
  // old behavior - assume that the repeated type is the element type
  Schema elementRecord = record("list", field("element", structWithElementField));
  Schema oldSchema = record("ListOfSingleElementStructsWithElementField",
      optionalField("list_of_structs", array(elementRecord)));
  GenericRecord oldRecord = instance(oldSchema, "list_of_structs", Arrays.asList(
      instance(elementRecord, "element", instance(structWithElementField, "element", 33.0F)),
      instance(elementRecord, "element", instance(structWithElementField, "element", 34.0F))));
  // check the schema
  ParquetFileReader reader = ParquetFileReader.open(new Configuration(), test);
  MessageType fileSchema = reader.getFileMetaData().getSchema();
  Assert.assertEquals("Converted schema should assume 2-layer structure",
      oldSchema, new AvroSchemaConverter(OLD_BEHAVIOR_CONF).convert(fileSchema));
  // both should default to the 2-layer structure
  assertReaderContains(oldBehaviorReader(test), oldSchema, oldRecord);
  Schema newSchema = record("ListOfSingleElementStructsWithElementField",
      optionalField("list_of_structs", array(structWithElementField)));
  GenericRecord newRecord = instance(newSchema, "list_of_structs", Arrays.asList(
      instance(structWithElementField, "element", 33.0F),
      instance(structWithElementField, "element", 34.0F)));
  // check the schema
  Assert.assertEquals("Converted schema should assume 3-layer structure",
      newSchema, new AvroSchemaConverter(NEW_BEHAVIOR_CONF).convert(fileSchema));
  assertReaderContains(newBehaviorReader(test), newSchema, newRecord);
  // check that this works with compatible nested schemas
  Schema structWithDoubleElementField = record("element", field("element", primitive(Schema.Type.DOUBLE)));
  Schema doubleElementRecord = record("list", field("element", structWithDoubleElementField));
  Schema oldDoubleSchema = record("ListOfSingleElementStructsWithElementField",
      optionalField("list_of_structs", array(doubleElementRecord)));
  GenericRecord oldDoubleRecord = instance(oldDoubleSchema, "list_of_structs", Arrays.asList(
      instance(doubleElementRecord, "element", instance(structWithDoubleElementField, "element", 33.0)),
      instance(doubleElementRecord, "element", instance(structWithDoubleElementField, "element", 34.0))));
  assertReaderContains(oldBehaviorReader(test, oldDoubleSchema), oldDoubleSchema, oldDoubleRecord);
  Schema newDoubleSchema = record("ListOfSingleElementStructsWithElementField",
      optionalField("list_of_structs", array(structWithDoubleElementField)));
  GenericRecord newDoubleRecord = instance(newDoubleSchema, "list_of_structs", Arrays.asList(
      instance(structWithDoubleElementField, "element", 33.0),
      instance(structWithDoubleElementField, "element", 34.0)));
  assertReaderContains(newBehaviorReader(test, newDoubleSchema), newDoubleSchema, newDoubleRecord);
}
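The test above only needs the reader to grab the file schema. A minimal standalone sketch of that step follows, with the reader closed via try-with-resources; the path is a placeholder (the test writes its own temporary file via writeDirect), and ParquetFileReader.open(Configuration, Path) is the same call used in the test.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class PrintFileSchema {
  public static void main(String[] args) throws Exception {
    // placeholder path
    Path test = new Path("/tmp/list_of_structs.parquet");
    // ParquetFileReader is Closeable, so try-with-resources releases the underlying stream
    try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), test)) {
      MessageType fileSchema = reader.getFileMetaData().getSchema();
      System.out.println(fileSchema);
    }
  }
}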
Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
The class ShowDictionaryCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  ColumnDescriptor descriptor = Util.descriptor(column, schema);
  PrimitiveType type = Util.primitive(column, schema);
  Preconditions.checkNotNull(type);
  DictionaryPageReadStore dictionaryReader;
  int rowGroup = 0;
  while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
    DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
    Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
    console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column, page.getCompressedSize());
    for (int i = 0; i <= dict.getMaxId(); i += 1) {
      switch (type.getPrimitiveTypeName()) {
        case BINARY:
          if (type.getOriginalType() == OriginalType.UTF8) {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
          } else {
            console.info("{}: {}", String.format("%6d", i),
                Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
          }
          break;
        case INT32:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
          break;
        case INT64:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
          break;
        case FLOAT:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
          break;
        case DOUBLE:
          console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
          break;
        default:
          throw new IllegalArgumentException("Unknown dictionary type: " + type.getPrimitiveTypeName());
      }
    }
    reader.skipNextRowGroup();
    rowGroup += 1;
  }
  console.info("");
  return 0;
}
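Stripped of the CLI plumbing (Preconditions, Util, console), the dictionary-walking pattern above can be sketched roughly as follows. The class name, the placeholder path, and the choice of the first leaf column are assumptions made for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class DumpFirstColumnDictionary {
  public static void main(String[] args) throws Exception {
    // placeholder path
    Path file = new Path("/tmp/example.parquet");
    try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
      MessageType schema = reader.getFileMetaData().getSchema();
      // first leaf column, for brevity; the command above resolves the column by name
      ColumnDescriptor descriptor = schema.getColumns().get(0);
      DictionaryPageReadStore dictionaryReader;
      int rowGroup = 0;
      while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
        DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
        if (page == null) {
          System.out.println("Row group " + rowGroup + ": column is not dictionary-encoded");
        } else {
          Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
          System.out.println("Row group " + rowGroup + ": " + (dict.getMaxId() + 1) + " dictionary entries");
        }
        // as in the command above, advance to the next row group explicitly
        reader.skipNextRowGroup();
        rowGroup += 1;
      }
    }
  }
}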
Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
The class ShowPagesCommand, method run.
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
  String source = targets.get(0);
  ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
  MessageType schema = reader.getFileMetaData().getSchema();
  Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
  if (this.columns == null || this.columns.isEmpty()) {
    for (ColumnDescriptor descriptor : schema.getColumns()) {
      columns.put(descriptor, primitive(schema, descriptor.getPath()));
    }
  } else {
    for (String column : this.columns) {
      columns.put(descriptor(column, schema), primitive(column, schema));
    }
  }
  CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
  // accumulate formatted lines to print by column
  Map<String, List<String>> formatted = Maps.newLinkedHashMap();
  PageFormatter formatter = new PageFormatter();
  PageReadStore pageStore;
  int rowGroupNum = 0;
  while ((pageStore = reader.readNextRowGroup()) != null) {
    for (ColumnDescriptor descriptor : columns.keySet()) {
      List<String> lines = formatted.get(columnName(descriptor));
      if (lines == null) {
        lines = Lists.newArrayList();
        formatted.put(columnName(descriptor), lines);
      }
      formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
      PageReader pages = pageStore.getPageReader(descriptor);
      DictionaryPage dict = pages.readDictionaryPage();
      if (dict != null) {
        lines.add(formatter.format(dict));
      }
      DataPage page;
      while ((page = pages.readPage()) != null) {
        lines.add(formatter.format(page));
      }
    }
    rowGroupNum += 1;
  }
  // TODO: Show total column size and overall size per value in the column summary line
  for (String columnName : formatted.keySet()) {
    console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
    console.info(formatter.getHeader());
    for (String line : formatted.get(columnName)) {
      console.info(line);
    }
    console.info("");
  }
  return 0;
}
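The core loop above — readNextRowGroup() per row group, getPageReader() per column, then readDictionaryPage() and readPage() until exhausted — can be reduced to a rough standalone sketch like the one below. The class name and path are placeholders, and the output is simplified to a data-page count per column.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class CountPages {
  public static void main(String[] args) throws Exception {
    // placeholder path
    Path file = new Path("/tmp/example.parquet");
    try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
      MessageType schema = reader.getFileMetaData().getSchema();
      PageReadStore rowGroup;
      int rowGroupNum = 0;
      // readNextRowGroup() returns null once every row group has been consumed
      while ((rowGroup = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor descriptor : schema.getColumns()) {
          PageReader pages = rowGroup.getPageReader(descriptor);
          int pageCount = 0;
          DataPage page;
          while ((page = pages.readPage()) != null) {
            pageCount++;
          }
          System.out.println("Row group " + rowGroupNum + ", column "
              + String.join(".", descriptor.getPath()) + ": " + pageCount + " data pages");
        }
        rowGroupNum++;
      }
    }
  }
}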
Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.
The class SchemaCommand, method getParquetSchema.
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);
    switch (format) {
      case PARQUET:
        return new ParquetFileReader(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
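As a side note, recent parquet-mr releases deprecate the (Configuration, Path, MetadataFilter) constructor used above in favor of opening the reader from an InputFile. A rough equivalent of the schema lookup, assuming HadoopInputFile is available on the classpath and using a placeholder path, might look like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class SchemaViaInputFile {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // placeholder path
    Path source = new Path("/tmp/example.parquet");
    // HadoopInputFile bridges a Hadoop Path to the InputFile-based reader factory
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(source, conf))) {
      System.out.println(reader.getFileMetaData().getSchema());
    }
  }
}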