use of org.apache.parquet.io.InputFile in project alluxio by Alluxio.
the class ParquetReader method create.
/**
* Creates a parquet reader.
*
* @param uri the URI to the input
* @return the reader
* @throws IOException when failed to create the reader
*/
public static ParquetReader create(AlluxioURI uri) throws IOException {
  Path inputPath = new JobPath(uri.getScheme(), uri.getAuthority().toString(), uri.getPath());
  Configuration conf = ReadWriterUtils.readNoCacheConf();
  InputFile inputFile = HadoopInputFile.fromPath(inputPath, conf);
  org.apache.parquet.hadoop.ParquetReader<Record> reader =
      AvroParquetReader.<Record>builder(inputFile)
          .disableCompatibility()
          .withDataModel(GenericData.get())
          .withConf(conf)
          .build();
  Schema schema;
  ParquetMetadata footer;
  try (ParquetFileReader r = new ParquetFileReader(inputFile, ParquetReadOptions.builder().build())) {
    footer = r.getFooter();
    schema = new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
  return new ParquetReader(reader, schema, footer);
}
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class ShowColumnIndexCommand method run.
@Override
public int run() throws IOException {
  Preconditions.checkArgument(files != null && files.size() >= 1, "A Parquet file is required.");
  Preconditions.checkArgument(files.size() == 1, "Cannot process multiple Parquet files.");
  InputFile in = HadoopInputFile.fromPath(qualifiedPath(files.get(0)), getConf());
  if (!showColumnIndex && !showOffsetIndex) {
    showColumnIndex = true;
    showOffsetIndex = true;
  }
  Set<String> rowGroupIndexSet = new HashSet<>();
  if (rowGroupIndexes != null) {
    rowGroupIndexSet.addAll(rowGroupIndexes);
  }
  try (ParquetFileReader reader = ParquetFileReader.open(in)) {
    boolean firstBlock = true;
    int rowGroupIndex = 0;
    for (BlockMetaData block : reader.getFooter().getBlocks()) {
      if (!rowGroupIndexSet.isEmpty() && !rowGroupIndexSet.contains(Integer.toString(rowGroupIndex))) {
        ++rowGroupIndex;
        continue;
      }
      if (!firstBlock) {
        console.info("");
      }
      firstBlock = false;
      console.info("row-group {}:", rowGroupIndex);
      for (ColumnChunkMetaData column : getColumns(block)) {
        String path = column.getPath().toDotString();
        if (showColumnIndex) {
          console.info("column index for column {}:", path);
          ColumnIndex columnIndex = reader.readColumnIndex(column);
          if (columnIndex == null) {
            console.info("NONE");
          } else {
            console.info(columnIndex.toString());
          }
        }
        if (showOffsetIndex) {
          console.info("offset index for column {}:", path);
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          if (offsetIndex == null) {
            console.info("NONE");
          } else {
            console.info(offsetIndex.toString());
          }
        }
      }
      ++rowGroupIndex;
    }
  }
  return 0;
}
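The command above wraps plain parquet-mr API calls. Below is a minimal standalone sketch of the same column-index and offset-index reads, stripped of the CLI plumbing; the input path taken from args[0] is an assumption.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.internal.column.columnindex.ColumnIndex;
import org.apache.parquet.internal.column.columnindex.OffsetIndex;
import org.apache.parquet.io.InputFile;

public class ColumnIndexSketch {
  public static void main(String[] args) throws Exception {
    // args[0] is a hypothetical path to a Parquet file.
    InputFile in = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(in)) {
      for (BlockMetaData block : reader.getFooter().getBlocks()) {
        for (ColumnChunkMetaData column : block.getColumns()) {
          // Either index may be absent; the readers return null in that case.
          ColumnIndex columnIndex = reader.readColumnIndex(column);
          OffsetIndex offsetIndex = reader.readOffsetIndex(column);
          System.out.printf("%s: columnIndex=%s offsetIndex=%s%n",
              column.getPath().toDotString(),
              columnIndex == null ? "NONE" : "present",
              offsetIndex == null ? "NONE" : "present");
        }
      }
    }
  }
}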
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class ShowFooterCommand method run.
@Override
public int run() throws IOException {
  InputFile inputFile = HadoopInputFile.fromPath(qualifiedPath(target), getConf());
  console.info(raw ? readRawFooter(inputFile) : readFooter(inputFile));
  return 0;
}
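readFooter and readRawFooter are helpers of the CLI class, not shown here. A hedged sketch of fetching the footer directly through ParquetFileReader, using parquet-mr's ParquetMetadata.toPrettyJSON for output (the args[0] path is an assumption):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class FooterSketch {
  public static void main(String[] args) throws Exception {
    // args[0] is a hypothetical path to a Parquet file.
    InputFile inputFile = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
    try (ParquetFileReader reader = ParquetFileReader.open(inputFile)) {
      ParquetMetadata footer = reader.getFooter();
      // Pretty-printed JSON rendering of the footer metadata.
      System.out.println(ParquetMetadata.toPrettyJSON(footer));
    }
  }
}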
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class TestUtils method readMessages.
/**
 * Reads messages from the given file into the expected proto class.
 * @param file the Parquet file to read
 * @param messageClass the protobuf message class to materialize, or null to use the class
 *        recorded in the file metadata
 * @param <T> the protobuf message type
 * @return list of protobuf messages of the given type
 * @throws IOException when the file cannot be read
 */
public static <T extends MessageOrBuilder> List<T> readMessages(Path file, Class<T> messageClass) throws IOException {
  InputFile inputFile = HadoopInputFile.fromPath(file, new Configuration());
  ParquetReader.Builder readerBuilder = ProtoParquetReader.builder(inputFile);
  if (messageClass != null) {
    readerBuilder.set(ProtoReadSupport.PB_CLASS, messageClass.getName());
  }
  try (ParquetReader reader = readerBuilder.build()) {
    List<T> result = new ArrayList<>();
    // read() returns null once all messages have been consumed.
    T item;
    while ((item = (T) reader.read()) != null) {
      result.add((T) asMessage(item));
    }
    return result;
  }
}
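A hedged usage sketch of readMessages. SensorReading stands in for any protobuf-generated message class and does not exist in the project, and the file path is likewise an assumption.

import java.util.List;
import org.apache.hadoop.fs.Path;

// SensorReading is a hypothetical protobuf-generated class; substitute your own.
List<SensorReading> messages =
    TestUtils.readMessages(new Path("/tmp/messages.parquet"), SensorReading.class);
messages.forEach(m -> System.out.println(m));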