Example use of org.apache.parquet.hadoop.Footer in project parquet-mr (by Apache): the readSchema method of class ParquetTupleScheme.
/**
 * Resolves the Parquet schema for the given tap by reading the first available footer.
 *
 * <p>If the tap is a {@code CompositeTap}, the first child tap is used as the
 * underlying {@code Hfs} source.
 *
 * @param flowProcess the Cascading flow process providing the Hadoop job configuration
 * @param tap the tap (or composite of taps) pointing at Parquet data
 * @return the {@code MessageType} schema taken from the first footer's file metadata
 * @throws TapException if no footers are found or the metadata cannot be read
 */
private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
  try {
    // A composite tap delegates to child taps; sample the first one.
    final Hfs hfs = (tap instanceof CompositeTap)
        ? (Hfs) ((CompositeTap) tap).getChildTaps().next()
        : (Hfs) tap;

    final List<Footer> footers = getFooters(flowProcess, hfs);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
    }
    // All files under the tap are expected to share a schema; the first footer suffices.
    return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new TapException(e);
  }
}
Example use of org.apache.parquet.hadoop.Footer in project parquet-mr (by Apache): an overload of the readSchema method of class ParquetTupleScheme, typed with a non-wildcard FlowProcess.
/**
 * Reads the Parquet message schema backing the supplied tap.
 *
 * <p>Composite taps are unwrapped to their first child {@code Hfs} tap before the
 * footers are gathered.
 *
 * @param flowProcess flow process carrying the {@code JobConf}
 * @param tap source tap whose Parquet files should be inspected
 * @return the schema recorded in the first discovered footer
 * @throws TapException when no Parquet metadata is present or an I/O error occurs
 */
private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
  try {
    Hfs source;
    if (tap instanceof CompositeTap) {
      // Unwrap the composite: its first child carries the actual HDFS location.
      source = (Hfs) ((CompositeTap) tap).getChildTaps().next();
    } else {
      source = (Hfs) tap;
    }

    final List<Footer> footers = getFooters(flowProcess, source);
    if (footers.isEmpty()) {
      throw new TapException("Could not read Parquet metadata at " + source.getPath());
    }
    return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
  } catch (IOException e) {
    throw new TapException(e);
  }
}
Example use of org.apache.parquet.hadoop.Footer in project drill (by axbaretto): the validateFooters method of class ParquetRecordReaderTest.
/**
 * Verifies the structure of the footers read from the generated test files.
 *
 * <p>Expects exactly three footers, each backed by an existing file whose name
 * starts with "part", containing two row groups and the expected key/value
 * metadata entries ("foo" -> "bar" plus a per-file self-referencing entry).
 *
 * @param metadata footers produced by the Parquet footer reader under test
 */
private void validateFooters(final List<Footer> metadata) {
  logger.debug(metadata.toString());
  assertEquals(3, metadata.size());
  for (final Footer f : metadata) {
    final File partFile = new File(f.getFile().toUri());
    // File name doubles as the assertion message for easier failure diagnosis.
    assertTrue(partFile.getName(), partFile.getName().startsWith("part"));
    assertTrue(partFile.getPath(), partFile.exists());

    final ParquetMetadata parquetMetadata = f.getParquetMetadata();
    assertEquals(2, parquetMetadata.getBlocks().size());

    final Map<String, String> kv = parquetMetadata.getFileMetaData().getKeyValueMetaData();
    assertEquals("bar", kv.get("foo"));
    // Each file embeds its own name as a key/value metadata entry.
    assertEquals(f.getFile().getName(), kv.get(f.getFile().getName()));
  }
}
Example use of org.apache.parquet.hadoop.Footer in project drill (by Apache): the testPerformance method of class ParquetRecordReaderTest.
/**
 * Repeatedly scans a large pre-generated Parquet file with {@code ParquetRecordReader}
 * to exercise read performance. Disabled by default ({@code @Ignore}).
 *
 * <p>NOTE(review): the {@code Stopwatch} is started but its elapsed time is never
 * reported, and {@code totalRows} is accumulated but unused — presumably leftovers
 * from earlier instrumentation; confirm before relying on this test for timing.
 */
@Test
@Ignore
public void testPerformance() throws Exception {
  final DrillbitContext bitContext = mock(DrillbitContext.class);
  final UserClientConnection connection = mock(UserClientConnection.class);
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContextImpl context =
      new FragmentContextImpl(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);

  final Path fileName = new Path("/tmp/parquet_test_performance.parquet");
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props =
      new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);

  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, fileName);
  final Footer firstFooter = footers.iterator().next();

  // Project the columns exercised by the scan.
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));

  int totalRows = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);

  // Re-read the same file many times so per-iteration setup cost is amortized.
  for (int iteration = 0; iteration < 25; iteration++) {
    final CompressionCodecFactory codecFactory = DrillCompressionCodecFactory.createDirectCodecFactory(
        dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0);
    final ParquetRecordReader reader = new ParquetRecordReader(
        context, fileName, 0, fs, codecFactory, firstFooter.getParquetMetadata(), columns,
        ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    reader.setup(null, mutator);
    final Stopwatch watch = Stopwatch.createStarted();

    // Drain the reader; next() returns 0 when the file is exhausted.
    int batchRows;
    while ((batchRows = reader.next()) > 0) {
      totalRows += batchRows;
    }
    reader.close();
  }
  allocator.close();
}
Example use of org.apache.parquet.hadoop.Footer in project drill (by Apache): the readFooter method of class FooterGatherer.
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * <p>Strategy: read a fixed-size chunk from the end of the file. If the footer fits in
 * that chunk, slice it out; otherwise issue one more read for the bytes preceding the
 * chunk and stitch the two pieces together.
 *
 * @param config configuration for file system
 * @param status file status of the Parquet file to read the footer from
 * @return Footer wrapping the parsed {@code ParquetMetadata} for the file
 * @throws IOException if the file cannot be opened/read, or if it fails validation
 *         (too small, or missing the Parquet magic bytes)
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
final FileSystem fs = status.getPath().getFileSystem(config);
try (FSDataInputStream file = fs.open(status.getPath())) {
final long fileLength = status.getLen();
Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
// Initial speculative read of up to DEFAULT_READ_SIZE bytes from the file tail.
int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
byte[] footerBytes = new byte[len];
readFully(file, fileLength - len, footerBytes, 0, len);
// Validate the trailing Parquet magic ("PAR1") before trusting the length field.
checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
// The 4-byte little-endian footer length sits immediately before the magic bytes.
final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
// if the footer is larger than our initial read, we need to read the rest.
byte[] origFooterBytes = footerBytes;
// Number of footer bytes already captured by the first read (everything in the
// initial chunk before the length+magic trailer).
int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
footerBytes = new byte[size];
// Read the missing prefix of the footer into the front of the new buffer...
readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
// ...then append the already-read suffix behind it to complete the footer.
System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
} else {
// Footer fits entirely in the initial chunk: slice out exactly [start, start+size).
int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
}
final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
// NO_FILTER: parse the complete metadata, no row-group filtering at this stage.
ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
Footer footer = new Footer(status.getPath(), metadata);
return footer;
}
}
Aggregations