Use of org.apache.parquet.hadoop.Footer in project drill by apache.
In class ParquetRecordReaderTest, method testPerformance:
@Test
@Ignore
public void testPerformance(@Injectable final DrillbitContext bitContext, @Injectable UserClientConnection connection) throws Exception {
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContext context = new FragmentContext(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  // new NonStrictExpectations() {
  //   {
  //     context.getAllocator(); result = BufferAllocator.getAllocator(DrillConfig.create());
  //   }
  // };
  final String fileName = "/tmp/parquet_test_performance.parquet";
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  // generateParquetFile(fileName, props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, new Path(fileName));
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs, CodecFactory.createDirectCodecFactory(dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0), f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = Stopwatch.createStarted();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    System.out.println(String.format("Time completed: %s. ", watch.elapsed(TimeUnit.MILLISECONDS)));
    rr.close();
  }
  allocator.close();
  System.out.println(String.format("Total row count %s", totalRowCount));
}
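The heart of the test is ParquetFileReader.readFooters(Configuration, Path), which returns one Footer per matched Parquet file. Below is a minimal standalone sketch of that call, printing each file's schema and per-row-group row counts; the path is hypothetical, so substitute any Parquet file:

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterInspect {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/parquet_test_performance.parquet"); // hypothetical path
    // Same call the test uses: one Footer per matched Parquet file.
    List<Footer> footers = ParquetFileReader.readFooters(conf, path);
    for (Footer footer : footers) {
      ParquetMetadata metadata = footer.getParquetMetadata();
      System.out.println("file: " + footer.getFile());
      System.out.println("schema: " + metadata.getFileMetaData().getSchema());
      for (BlockMetaData block : metadata.getBlocks()) {
        System.out.println("  row group rows: " + block.getRowCount());
      }
    }
  }
}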
Use of org.apache.parquet.hadoop.Footer in project drill by axbaretto.
In class FooterGatherer, method getFooters:
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedRunnable<Footer>> readers = Lists.newArrayList();
  List<Footer> foundFooters = Lists.newArrayList();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedRunnable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}
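A minimal usage sketch for the method above, assuming FooterGatherer is on the classpath (package org.apache.drill.exec.store.parquet) and a hypothetical directory of Parquet files. Passing the directory's FileStatus exercises the branching shown: the _metadata summary file is used if present, otherwise one FooterReader is scheduled per contained file:

import java.util.Collections;
import java.util.List;
import org.apache.drill.exec.store.parquet.FooterGatherer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;

public class GatherFooters {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path dir = new Path("/tmp/parquet_dir"); // hypothetical directory
    FileSystem fs = dir.getFileSystem(conf);
    List<FileStatus> statuses = Collections.singletonList(fs.getFileStatus(dir));
    // The last argument bounds how many footers are fetched concurrently.
    List<Footer> footers = FooterGatherer.getFooters(conf, statuses, 4);
    System.out.println("Gathered " + footers.size() + " footers");
  }
}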
Use of org.apache.parquet.hadoop.Footer in project drill by axbaretto.
In class FooterGatherer, method readFooter:
/**
 * An updated footer reader that tries to read the entire footer without knowing its length
 * up front. This should reduce the number of seek/read round trips in most workloads.
 *
 * @param config file system configuration
 * @param status file status of the Parquet file to read
 * @return the parsed footer
 * @throws IOException if the file cannot be read or is not a valid Parquet file
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try (FSDataInputStream file = fs.open(status.getPath())) {
    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);
    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
      footerBytes = new byte[size];
      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(new ByteArrayInputStream(footerBytes));
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
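The arithmetic above relies on the Parquet file layout: a file ends with the serialized footer, then a 4-byte little-endian footer length, then the 4-byte magic "PAR1" (hence a FOOTER_METADATA_SIZE of 8). A minimal sketch that reads just those trailing 8 bytes to recover the footer length, with a hypothetical path:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FooterTail {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path("/tmp/parquet_test_performance.parquet"); // hypothetical path
    FileSystem fs = path.getFileSystem(conf);
    FileStatus status = fs.getFileStatus(path);
    byte[] tail = new byte[8]; // 4-byte footer length + 4-byte magic
    try (FSDataInputStream in = fs.open(path)) {
      in.readFully(status.getLen() - tail.length, tail, 0, tail.length);
    }
    // The footer length is little-endian and counts only the serialized footer bytes.
    int footerLen = (tail[0] & 0xff) | ((tail[1] & 0xff) << 8)
        | ((tail[2] & 0xff) << 16) | ((tail[3] & 0xff) << 24);
    String magic = new String(tail, 4, 4, StandardCharsets.US_ASCII);
    System.out.println("footer length = " + footerLen + ", magic = " + magic);
  }
}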
Use of org.apache.parquet.hadoop.Footer in project drill by axbaretto.
In class ParquetRecordReaderTest, method testPerformance:
@Test
@Ignore
public void testPerformance() throws Exception {
  final DrillbitContext bitContext = mock(DrillbitContext.class);
  final UserClientConnection connection = mock(UserClientConnection.class);
  final DrillConfig c = DrillConfig.create();
  final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
  final FragmentContextImpl context = new FragmentContextImpl(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
  final String fileName = "/tmp/parquet_test_performance.parquet";
  final HashMap<String, FieldInfo> fields = new HashMap<>();
  final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
  populateFieldInfoMap(props);
  final Configuration dfsConfig = new Configuration();
  final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, new Path(fileName));
  final Footer f = footers.iterator().next();
  final List<SchemaPath> columns = Lists.newArrayList();
  columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
  columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
  int totalRowCount = 0;
  final FileSystem fs = new CachedSingleFileSystem(fileName);
  final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
  for (int i = 0; i < 25; i++) {
    final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs, CodecFactory.createDirectCodecFactory(dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0), f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
    final TestOutputMutator mutator = new TestOutputMutator(allocator);
    rr.setup(null, mutator);
    final Stopwatch watch = Stopwatch.createStarted();
    int rowCount = 0;
    while ((rowCount = rr.next()) > 0) {
      totalRowCount += rowCount;
    }
    System.out.println(String.format("Time completed: %s. ", watch.elapsed(TimeUnit.MILLISECONDS)));
    rr.close();
  }
  allocator.close();
  System.out.println(String.format("Total row count %s", totalRowCount));
}
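This variant is the same benchmark constructed with Mockito mocks instead of JMockit injectables. The loop reports raw elapsed milliseconds per pass; a throughput figure falls out of a one-line division. A minimal sketch using the values the test already tracks (the class and method names here are hypothetical):

public class ThroughputReport {
  // totalRowCount and elapsedMs come from the test's counters and Stopwatch;
  // clamp elapsedMs to avoid dividing by zero on very fast passes.
  static void printThroughput(long totalRowCount, long elapsedMs) {
    double seconds = Math.max(elapsedMs, 1) / 1000.0;
    System.out.println(String.format("Throughput: %.0f rows/sec", totalRowCount / seconds));
  }
}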
Use of org.apache.parquet.hadoop.Footer in project drill by apache.
In class FooterGatherer, method getFooters:
/**
 * Gets a list of footers for the given file statuses.
 *
 * @param conf configuration for the file system
 * @param statuses list of file statuses
 * @param parallelism maximum number of footers to fetch concurrently
 * @return a list of footers
 * @throws IOException if reading a footer or summary file fails
 */
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> readers = new ArrayList<>();
  final List<Footer> foundFooters = new ArrayList<>();
  for (FileStatus status : statuses) {
    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);
      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }
      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }
  return foundFooters;
}