Example 11 with Footer

Use of org.apache.parquet.hadoop.Footer in project parquet-mr by apache, in class ParquetTupleScheme, method readSchema.

private MessageType readSchema(FlowProcess<? extends JobConf> flowProcess, Tap tap) {
    try {
        Hfs hfs;
        // A CompositeTap wraps child taps; unwrap it to reach the underlying Hfs tap.
        if (tap instanceof CompositeTap)
            hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
        else
            hfs = (Hfs) tap;
        List<Footer> footers = getFooters(flowProcess, hfs);
        if (footers.isEmpty()) {
            throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
        } else {
            // All files behind the tap are assumed to share a schema, so the first footer's suffices.
            return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
        }
    } catch (IOException e) {
        throw new TapException(e);
    }
}
Also used: Hfs (cascading.tap.hadoop.Hfs), CompositeTap (cascading.tap.CompositeTap), Footer (org.apache.parquet.hadoop.Footer), TapException (cascading.tap.TapException), IOException (java.io.IOException)
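
For reference, the same footer-to-schema navigation works outside Cascading. A minimal sketch against a hypothetical local file, using the ParquetFileReader.readFooters call that also appears in Example 14 (deprecated in recent parquet-mr releases, but present in the versions these examples target):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class ReadSchemaSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical input; point this at a real Parquet file.
        Path path = new Path("/tmp/example.parquet");
        List<Footer> footers = ParquetFileReader.readFooters(new Configuration(), path);
        if (footers.isEmpty()) {
            throw new IllegalStateException("Could not read Parquet metadata at " + path);
        }
        // Same navigation as readSchema above: footer -> metadata -> file metadata -> schema.
        MessageType schema = footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
        System.out.println(schema);
    }
}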

Example 12 with Footer

Use of org.apache.parquet.hadoop.Footer in project parquet-mr by apache, in class ParquetTupleScheme, method readSchema.
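
This variant differs from Example 11 only in its signature (FlowProcess&lt;JobConf&gt; rather than FlowProcess&lt;? extends JobConf&gt;), presumably bindings for two different Cascading API versions; the method body is identical.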

private MessageType readSchema(FlowProcess<JobConf> flowProcess, Tap tap) {
    try {
        Hfs hfs;
        if (tap instanceof CompositeTap)
            hfs = (Hfs) ((CompositeTap) tap).getChildTaps().next();
        else
            hfs = (Hfs) tap;
        List<Footer> footers = getFooters(flowProcess, hfs);
        if (footers.isEmpty()) {
            throw new TapException("Could not read Parquet metadata at " + hfs.getPath());
        } else {
            return footers.get(0).getParquetMetadata().getFileMetaData().getSchema();
        }
    } catch (IOException e) {
        throw new TapException(e);
    }
}
Also used: Hfs (cascading.tap.hadoop.Hfs), CompositeTap (cascading.tap.CompositeTap), Footer (org.apache.parquet.hadoop.Footer), TapException (cascading.tap.TapException), IOException (java.io.IOException)

Example 13 with Footer

Use of org.apache.parquet.hadoop.Footer in project drill by axbaretto, in class ParquetRecordReaderTest, method validateFooters.

private void validateFooters(final List<Footer> metadata) {
    logger.debug(metadata.toString());
    assertEquals(3, metadata.size());
    // Each part file should contain two row groups and the expected key/value metadata.
    for (Footer footer : metadata) {
        final File file = new File(footer.getFile().toUri());
        assertTrue(file.getName(), file.getName().startsWith("part"));
        assertTrue(file.getPath(), file.exists());
        final ParquetMetadata parquetMetadata = footer.getParquetMetadata();
        assertEquals(2, parquetMetadata.getBlocks().size());
        final Map<String, String> keyValueMetaData = parquetMetadata.getFileMetaData().getKeyValueMetaData();
        assertEquals("bar", keyValueMetaData.get("foo"));
        assertEquals(footer.getFile().getName(), keyValueMetaData.get(footer.getFile().getName()));
    }
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), Footer (org.apache.parquet.hadoop.Footer), File (java.io.File)
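
The key/value pairs asserted above are ordinary user metadata stored in the footer. A small, hypothetical sketch that dumps them for every file under a directory, reusing the deprecated ParquetFileReader.readFooters call from Example 14:

import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;

public class DumpKeyValueMetadata {
    public static void main(String[] args) throws Exception {
        // Hypothetical directory of part files.
        Path dir = new Path("/tmp/parquet_parts");
        List<Footer> footers = ParquetFileReader.readFooters(new Configuration(), dir);
        for (Footer footer : footers) {
            // The same accessor chain validateFooters uses for its assertions.
            Map<String, String> kv = footer.getParquetMetadata().getFileMetaData().getKeyValueMetaData();
            System.out.println(footer.getFile() + " -> " + kv);
        }
    }
}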

Example 14 with Footer

Use of org.apache.parquet.hadoop.Footer in project drill by apache, in class ParquetRecordReaderTest, method testPerformance.

@Test
@Ignore
public void testPerformance() throws Exception {
    final DrillbitContext bitContext = mock(DrillbitContext.class);
    final UserClientConnection connection = mock(UserClientConnection.class);
    final DrillConfig c = DrillConfig.create();
    final FunctionImplementationRegistry registry = new FunctionImplementationRegistry(c);
    final FragmentContextImpl context = new FragmentContextImpl(bitContext, BitControl.PlanFragment.getDefaultInstance(), connection, registry);
    final Path fileName = new Path("/tmp/parquet_test_performance.parquet");
    final HashMap<String, FieldInfo> fields = new HashMap<>();
    final ParquetTestProperties props = new ParquetTestProperties(1, 20 * 1000 * 1000, DEFAULT_BYTES_PER_PAGE, fields);
    populateFieldInfoMap(props);
    final Configuration dfsConfig = new Configuration();
    final List<Footer> footers = ParquetFileReader.readFooters(dfsConfig, fileName);
    final Footer f = footers.iterator().next();
    final List<SchemaPath> columns = Lists.newArrayList();
    columns.add(new SchemaPath("_MAP.integer", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bigInt", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.f", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.d", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.b", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bin", ExpressionPosition.UNKNOWN));
    columns.add(new SchemaPath("_MAP.bin2", ExpressionPosition.UNKNOWN));
    int totalRowCount = 0;
    final FileSystem fs = new CachedSingleFileSystem(fileName);
    final BufferAllocator allocator = RootAllocatorFactory.newRoot(c);
    // Read the same file 25 times to measure steady-state read performance.
    for (int i = 0; i < 25; i++) {
        CompressionCodecFactory ccf = DrillCompressionCodecFactory.createDirectCodecFactory(dfsConfig, new ParquetDirectByteBufferAllocator(allocator), 0);
        final ParquetRecordReader rr = new ParquetRecordReader(context, fileName, 0, fs, ccf, f.getParquetMetadata(), columns, ParquetReaderUtility.DateCorruptionStatus.META_SHOWS_CORRUPTION);
        final TestOutputMutator mutator = new TestOutputMutator(allocator);
        rr.setup(null, mutator);
        // The stopwatch is started here but never read in this excerpt.
        final Stopwatch watch = Stopwatch.createStarted();
        int rowCount = 0;
        while ((rowCount = rr.next()) > 0) {
            totalRowCount += rowCount;
        }
        rr.close();
    }
    allocator.close();
}
Also used: DrillbitContext (org.apache.drill.exec.server.DrillbitContext), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), Stopwatch (org.apache.drill.shaded.guava.com.google.common.base.Stopwatch), FragmentContextImpl (org.apache.drill.exec.ops.FragmentContextImpl), TestOutputMutator (org.apache.drill.exec.store.TestOutputMutator), DrillConfig (org.apache.drill.common.config.DrillConfig), CompressionCodecFactory (org.apache.parquet.compression.CompressionCodecFactory), DrillCompressionCodecFactory (org.apache.drill.exec.store.parquet.compression.DrillCompressionCodecFactory), SchemaPath (org.apache.drill.common.expression.SchemaPath), FileSystem (org.apache.hadoop.fs.FileSystem), CachedSingleFileSystem (org.apache.drill.exec.store.CachedSingleFileSystem), FunctionImplementationRegistry (org.apache.drill.exec.expr.fn.FunctionImplementationRegistry), Path (org.apache.hadoop.fs.Path), BufferAllocator (org.apache.drill.exec.memory.BufferAllocator), ParquetRecordReader (org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader), UserClientConnection (org.apache.drill.exec.rpc.UserClientConnection), Footer (org.apache.parquet.hadoop.Footer), Ignore (org.junit.Ignore), Test (org.junit.Test)
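
Example 14's inner loop starts a Stopwatch per iteration but, as excerpted, never reads it. A minimal sketch of the intended timing pattern, assuming plain Guava's Stopwatch (Drill uses a shaded copy of the same API); nextBatch() is a hypothetical stand-in for rr.next():

import java.util.concurrent.TimeUnit;
import com.google.common.base.Stopwatch;

public class TimingSketch {
    private static int batchesLeft = 5;

    // Hypothetical stand-in for rr.next(): returns a fake batch size, 0 when done.
    private static int nextBatch() {
        return batchesLeft-- > 0 ? 1000 : 0;
    }

    public static void main(String[] args) {
        Stopwatch watch = Stopwatch.createStarted();
        int totalRowCount = 0;
        int rowCount;
        while ((rowCount = nextBatch()) > 0) {
            totalRowCount += rowCount;
        }
        // Report what the excerpt's stopwatch was presumably meant to measure.
        System.out.println(totalRowCount + " rows in " + watch.elapsed(TimeUnit.MILLISECONDS) + " ms");
    }
}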

Example 15 with Footer

Use of org.apache.parquet.hadoop.Footer in project drill by apache, in class FooterGatherer, method readFooter.

/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 * @param config configuration for file system
 * @param status file status
 * @return Footer
 * @throws IOException if the file cannot be read or its footer is malformed
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
    final FileSystem fs = status.getPath().getFileSystem(config);
    try (FSDataInputStream file = fs.open(status.getPath())) {
        final long fileLength = status.getLen();
        Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());
        int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
        byte[] footerBytes = new byte[len];
        readFully(file, fileLength - len, footerBytes, 0, len);
        checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
        final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);
        if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
            // if the footer is larger than our initial read, we need to read the rest.
            byte[] origFooterBytes = footerBytes;
            int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;
            footerBytes = new byte[size];
            // Read the missing prefix of the footer, then append the bytes already read.
            readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
            System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
        } else {
            // The whole footer fit in the speculative read; slice it out of the buffer.
            int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
            footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
        }
        final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
        ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
        return new Footer(status.getPath(), metadata);
    }
}
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), FileSystem (org.apache.hadoop.fs.FileSystem), Footer (org.apache.parquet.hadoop.Footer), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)
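
The speculative read works because of the fixed Parquet tail layout: a file ends with the serialized footer, a 4-byte little-endian footer length, and the 4-byte magic "PAR1", so FOOTER_METADATA_SIZE is 8. A self-contained sketch (hypothetical path) that locates the footer by hand with plain java.io, independent of the Drill helpers above:

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

public class FooterTailSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical path; any valid Parquet file will do.
        try (RandomAccessFile file = new RandomAccessFile("/tmp/example.parquet", "r")) {
            long fileLength = file.length();
            // The last 8 bytes are: 4-byte little-endian footer length + 4-byte magic "PAR1".
            byte[] tail = new byte[8];
            file.seek(fileLength - tail.length);
            file.readFully(tail);
            String magic = new String(tail, 4, 4, StandardCharsets.US_ASCII);
            int footerLen = (tail[0] & 0xff)
                    | (tail[1] & 0xff) << 8
                    | (tail[2] & 0xff) << 16
                    | (tail[3] & 0xff) << 24;
            System.out.println("magic=" + magic
                    + ", footer starts at offset " + (fileLength - 8 - footerLen));
        }
    }
}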

Aggregations

Footer (org.apache.parquet.hadoop.Footer): 15
Path (org.apache.hadoop.fs.Path): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 7
Configuration (org.apache.hadoop.conf.Configuration): 6
FileStatus (org.apache.hadoop.fs.FileStatus): 5
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 4
HashMap (java.util.HashMap): 3
DrillConfig (org.apache.drill.common.config.DrillConfig): 3
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3
FunctionImplementationRegistry (org.apache.drill.exec.expr.fn.FunctionImplementationRegistry): 3
BufferAllocator (org.apache.drill.exec.memory.BufferAllocator): 3
CachedSingleFileSystem (org.apache.drill.exec.store.CachedSingleFileSystem): 3
TestOutputMutator (org.apache.drill.exec.store.TestOutputMutator): 3
ParquetRecordReader (org.apache.drill.exec.store.parquet.columnreaders.ParquetRecordReader): 3
Ignore (org.junit.Ignore): 3
Test (org.junit.Test): 3
CompositeTap (cascading.tap.CompositeTap): 2
TapException (cascading.tap.TapException): 2
Hfs (cascading.tap.hadoop.Hfs): 2
Stopwatch (com.google.common.base.Stopwatch): 2