use of org.apache.parquet.io.InputFile in project flink by apache.
the class ParquetRowDataWriterTest method readParquetFile.
private static List<Row> readParquetFile(File file) throws IOException {
    InputFile inFile =
            HadoopInputFile.fromPath(
                    new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

    ArrayList<Row> results = new ArrayList<>();
    try (ParquetReader<GenericRecord> reader =
            AvroParquetReader.<GenericRecord>builder(inFile).build()) {
        GenericRecord next;
        while ((next = reader.read()) != null) {
            // Unwrap the array, map and nested-record fields of the Avro record
            // and rebuild them as a Flink Row.
            Integer c0 = (Integer) ((ArrayList<GenericData.Record>) next.get(0)).get(0).get(0);
            HashMap<Utf8, Utf8> map = ((HashMap<Utf8, Utf8>) next.get(1));
            String c21 = ((GenericData.Record) next.get(2)).get(0).toString();
            Integer c22 = (Integer) ((GenericData.Record) next.get(2)).get(1);
            Map<String, String> c1 = new HashMap<>();
            for (Utf8 key : map.keySet()) {
                String k = key == null ? null : key.toString();
                String v = map.get(key) == null ? null : map.get(key).toString();
                c1.put(k, v);
            }
            Row row = Row.of(new Integer[] {c0}, c1, Row.of(c21, c22));
            results.add(row);
        }
    }
    return results;
}
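The helper above builds its InputFile through HadoopInputFile, so it needs a Hadoop Path and Configuration even though the file is local. As a minimal sketch, assuming a parquet-mr version that ships org.apache.parquet.io.LocalInputFile, the same read can be done without the Hadoop detour (LocalReadSketch and countRecords are illustrative names, not part of the Flink test):

import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.avro.generic.GenericRecord;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.LocalInputFile;

class LocalReadSketch {
    // Sketch: build the InputFile with LocalInputFile instead of HadoopInputFile.
    // Assumes a parquet-mr version that provides org.apache.parquet.io.LocalInputFile.
    static long countRecords(File file) throws IOException {
        InputFile inFile = new LocalInputFile(Paths.get(file.toURI()));
        long count = 0;
        try (ParquetReader<GenericRecord> reader =
                AvroParquetReader.<GenericRecord>builder(inFile).build()) {
            while (reader.read() != null) {
                count++;
            }
        }
        return count;
    }
}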
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class ParquetReader method initReader.
private void initReader() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
    }
    if (filesIterator.hasNext()) {
        InputFile file = filesIterator.next();
        ParquetFileReader fileReader = ParquetFileReader.open(file, options);
        reader = new InternalParquetRecordReader<>(readSupport, options.getRecordFilter());
        reader.initialize(fileReader, options);
    }
}
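initReader hands each InputFile to ParquetFileReader.open together with a ParquetReadOptions instance and then wraps the result in an internal record reader. A minimal standalone sketch of the same open call, used here only to walk row groups and print their row counts (RowGroupCountSketch is an illustrative name):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

public class RowGroupCountSketch {
    // Sketch: open an InputFile directly with ParquetFileReader and count rows per row group.
    public static void main(String[] args) throws IOException {
        InputFile file = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
        try (ParquetFileReader reader =
                ParquetFileReader.open(file, ParquetReadOptions.builder().build())) {
            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                System.out.println("row group with " + rowGroup.getRowCount() + " rows");
            }
        }
    }
}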
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class TestDataPageV1Checksums method testCorruptedPage.
/**
* Test whether corruption in the page content is detected by checksum verification
*/
@Test
public void testCorruptedPage() throws IOException {
    Configuration conf = new Configuration();
    conf.setBoolean(ParquetOutputFormat.PAGE_WRITE_CHECKSUM_ENABLED, true);
    Path path = writeSimpleParquetFile(conf, CompressionCodecName.UNCOMPRESSED);
    InputFile inputFile = HadoopInputFile.fromPath(path, conf);
    try (SeekableInputStream inputStream = inputFile.newStream()) {
        int fileLen = (int) inputFile.getLength();
        byte[] fileBytes = new byte[fileLen];
        inputStream.readFully(fileBytes);
        inputStream.close();
        // There are 4 pages in total (2 per column), we corrupt the first page of the first column
        // and the second page of the second column. We do this by altering a byte roughly in the
        // middle of each page to be corrupted
        fileBytes[fileLen / 8]++;
        fileBytes[fileLen / 8 + ((fileLen / 4) * 3)]++;
        OutputFile outputFile = HadoopOutputFile.fromPath(path, conf);
        try (PositionOutputStream outputStream = outputFile.createOrOverwrite(1024 * 1024)) {
            outputStream.write(fileBytes);
            outputStream.close();
            // First we disable checksum verification, the corruption will go undetected as it is in the
            // data section of the page
            conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, false);
            try (ParquetFileReader reader =
                    getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
                PageReadStore pageReadStore = reader.readNextRowGroup();
                DataPageV1 colAPage1 = readNextPage(colADesc, pageReadStore);
                assertFalse(
                        "Data in page was not corrupted",
                        Arrays.equals(colAPage1.getBytes().toByteArray(), colAPage1Bytes));
                readNextPage(colADesc, pageReadStore);
                readNextPage(colBDesc, pageReadStore);
                DataPageV1 colBPage2 = readNextPage(colBDesc, pageReadStore);
                assertFalse(
                        "Data in page was not corrupted",
                        Arrays.equals(colBPage2.getBytes().toByteArray(), colBPage2Bytes));
            }
            // Now we enable checksum verification, the corruption should be detected
            conf.setBoolean(ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED, true);
            try (ParquetFileReader reader =
                    getParquetFileReader(path, conf, Arrays.asList(colADesc, colBDesc))) {
                // We expect an exception on the first encountered corrupt page (in readAllPages)
                assertVerificationFailed(reader);
            }
        }
    }
}
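The test toggles verification through the ParquetInputFormat.PAGE_VERIFY_CHECKSUM_ENABLED configuration key. When opening an InputFile directly, the same behaviour can be requested on the options builder; this is only a sketch, assuming the usePageChecksumVerification option on ParquetReadOptions.Builder (shipped with the parquet-mr releases that added page checksums), and ChecksumVerifyingOpen is an illustrative name:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

class ChecksumVerifyingOpen {
    // Sketch: enable page checksum verification via ParquetReadOptions rather than
    // the Hadoop configuration flag used in the test above.
    static ParquetFileReader openWithVerification(Path path, Configuration conf) throws IOException {
        InputFile inputFile = HadoopInputFile.fromPath(path, conf);
        ParquetReadOptions options =
                ParquetReadOptions.builder().usePageChecksumVerification(true).build();
        return ParquetFileReader.open(inputFile, options);
    }
}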
use of org.apache.parquet.io.InputFile in project parquet-mr by apache.
the class ColumnEncryptorTest method getParquetMetadata.
private ParquetMetadata getParquetMetadata(FileDecryptionProperties decryptionProperties) throws IOException {
    ParquetMetadata metaData;
    ParquetReadOptions readOptions =
            ParquetReadOptions.builder().withDecryption(decryptionProperties).build();
    InputFile file = HadoopInputFile.fromPath(new Path(outputFile), conf);
    try (SeekableInputStream in = file.newStream()) {
        metaData = ParquetFileReader.readFooter(file, readOptions, in);
    }
    return metaData;
}
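getParquetMetadata opens the stream itself and passes it to the readFooter overload, which is deprecated in recent parquet-mr releases. A sketch of an equivalent that lets ParquetFileReader manage the stream and return the footer (FooterReadSketch is an illustrative name, and the outputFile/conf parameters stand in for the test's fields):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.crypto.FileDecryptionProperties;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;

class FooterReadSketch {
    // Sketch: read the footer by letting ParquetFileReader open and close the stream itself.
    static ParquetMetadata readFooter(
            String outputFile, Configuration conf, FileDecryptionProperties decryptionProperties)
            throws IOException {
        ParquetReadOptions readOptions =
                ParquetReadOptions.builder().withDecryption(decryptionProperties).build();
        InputFile file = HadoopInputFile.fromPath(new Path(outputFile), conf);
        try (ParquetFileReader reader = ParquetFileReader.open(file, readOptions)) {
            return reader.getFooter();
        }
    }
}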
use of org.apache.parquet.io.InputFile in project flink by apache.
the class AvroParquetStreamingFileSinkITCase method readParquetFile.
private static <T> List<T> readParquetFile(File file, GenericData dataModel) throws IOException {
    InputFile inFile =
            HadoopInputFile.fromPath(
                    new org.apache.hadoop.fs.Path(file.toURI()), new Configuration());

    ArrayList<T> results = new ArrayList<>();
    try (ParquetReader<T> reader =
            AvroParquetReader.<T>builder(inFile).withDataModel(dataModel).build()) {
        T next;
        while ((next = reader.read()) != null) {
            results.add(next);
        }
    }
    return results;
}
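Because the helper is generic over the Avro data model, a typical call site might look like the sketch below; Address is an illustrative Avro-generated class, not part of the test above:

// Sketch of call sites: generic records with GenericData, generated classes with SpecificData.
List<GenericRecord> genericRecords = readParquetFile(file, GenericData.get());
List<Address> specificRecords = readParquetFile(file, SpecificData.get());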