Example 31 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From class ClientSideMetadataSplitStrategy, method getSplits.

List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers, long maxSplitSize, long minSplitSize, ReadContext readContext) throws IOException {
    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    Filter filter = ParquetInputFormat.getFilter(configuration);
    long rowGroupsDropped = 0;
    long totalRowGroups = 0;
    for (Footer footer : footers) {
        final Path file = footer.getFile();
        LOG.debug("{}", file);
        FileSystem fs = file.getFileSystem(configuration);
        FileStatus fileStatus = fs.getFileStatus(file);
        ParquetMetadata parquetMetaData = footer.getParquetMetadata();
        List<BlockMetaData> blocks = parquetMetaData.getBlocks();
        List<BlockMetaData> filteredBlocks;
        totalRowGroups += blocks.size();
        // Drop row groups whose statistics show they cannot match the filter
        filteredBlocks = RowGroupFilter.filterRowGroups(filter, blocks, parquetMetaData.getFileMetaData().getSchema());
        rowGroupsDropped += blocks.size() - filteredBlocks.size();
        if (filteredBlocks.isEmpty()) {
            continue;
        }
        // Block locations let the generated splits be scheduled near the HDFS data
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        splits.addAll(generateSplits(filteredBlocks, fileBlockLocations, fileStatus, readContext.getRequestedSchema().toString(), readContext.getReadSupportMetadata(), minSplitSize, maxSplitSize));
    }
    if (rowGroupsDropped > 0 && totalRowGroups > 0) {
        int percentDropped = (int) ((((double) rowGroupsDropped) / totalRowGroups) * 100);
        LOG.info("Dropping {} row groups that do not pass filter predicate! ({}%)", rowGroupsDropped, percentDropped);
    } else {
        LOG.info("There were no row groups that could be dropped due to filter predicates");
    }
    return splits;
}
Also used: Path (org.apache.hadoop.fs.Path), BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), FileStatus (org.apache.hadoop.fs.FileStatus), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ArrayList (java.util.ArrayList), BlockLocation (org.apache.hadoop.fs.BlockLocation), HiddenFileFilter (org.apache.parquet.hadoop.util.HiddenFileFilter), Filter (org.apache.parquet.filter2.compat.FilterCompat.Filter), PathFilter (org.apache.hadoop.fs.PathFilter), RowGroupFilter (org.apache.parquet.filter2.compat.RowGroupFilter), UnboundRecordFilter (org.apache.parquet.filter.UnboundRecordFilter), FileSystem (org.apache.hadoop.fs.FileSystem)
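
For context, a minimal sketch of how the inputs this method consumes could be prepared. This is not part of the example above: it assumes the deprecated ParquetFileReader.readFooters(Configuration, Path) overload from this era of parquet-mr, and the filter column "id" is hypothetical.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.hadoop.Footer;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetInputFormat;

public class SplitSetupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The Filter returned by ParquetInputFormat.getFilter(configuration)
        // above is configured on the job up front, e.g. as a filter predicate
        // ("id" is a hypothetical column):
        ParquetInputFormat.setFilterPredicate(conf,
                FilterApi.gtEq(FilterApi.longColumn("id"), 0L));
        // One way to obtain the List<Footer> passed to getSplits
        List<Footer> footers = ParquetFileReader.readFooters(conf, new Path(args[0]));
        System.out.println("read " + footers.size() + " footers");
    }
}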

Example 32 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From class PrintFooter, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage PrintFooter <path>");
        return;
    }
    Path path = new Path(new URI(args[0]));
    final Configuration configuration = new Configuration();
    final FileSystem fs = path.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(path);
    // PARQUET_METADATA_FILE ("_metadata") is the summary file written next to the data files
    Path summary = new Path(fileStatus.getPath(), PARQUET_METADATA_FILE);
    if (fileStatus.isDir() && fs.exists(summary)) {
        System.out.println("reading summary file");
        FileStatus summaryStatus = fs.getFileStatus(summary);
        List<Footer> readSummaryFile = ParquetFileReader.readSummaryFile(configuration, summaryStatus);
        for (Footer footer : readSummaryFile) {
            add(footer.getParquetMetadata());
        }
    } else {
        List<FileStatus> statuses;
        if (fileStatus.isDir()) {
            System.out.println("listing files in " + fileStatus.getPath());
            statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
        } else {
            statuses = new ArrayList<FileStatus>();
            statuses.add(fileStatus);
        }
        System.out.println("opening " + statuses.size() + " files");
        int i = 0;
        ExecutorService threadPool = Executors.newFixedThreadPool(5);
        try {
            long t0 = System.currentTimeMillis();
            Deque<Future<ParquetMetadata>> footers = new LinkedBlockingDeque<Future<ParquetMetadata>>();
            for (final FileStatus currentFile : statuses) {
                footers.add(threadPool.submit(new Callable<ParquetMetadata>() {

                    @Override
                    public ParquetMetadata call() throws Exception {
                        try {
                            ParquetMetadata footer = ParquetFileReader.readFooter(configuration, currentFile, NO_FILTER);
                            return footer;
                        } catch (Exception e) {
                            throw new ParquetDecodingException("could not read footer", e);
                        }
                    }
                }));
            }
            // Draw an empty progress bar ("0% [ ... ] 100%"), then backspace
            // the cursor to the start of the bar so it can be filled in
            int previousPercent = 0;
            int n = 60;
            System.out.print("0% [");
            for (int j = 0; j < n; j++) {
                System.out.print(" ");
            }
            System.out.print("] 100%");
            for (int j = 0; j < n + 6; j++) {
                System.out.print('\b');
            }
            while (!footers.isEmpty()) {
                Future<ParquetMetadata> futureFooter = footers.removeFirst();
                if (!futureFooter.isDone()) {
                    // Not done yet: move it to the back of the deque and keep polling
                    footers.addLast(futureFooter);
                    continue;
                }
                ParquetMetadata footer = futureFooter.get();
                // Advance the progress bar in proportion to the files completed
                int currentPercent = (++i * n / statuses.size());
                while (currentPercent > previousPercent) {
                    System.out.print("*");
                    previousPercent++;
                }
                add(footer);
            }
            System.out.println("");
            long t1 = System.currentTimeMillis();
            System.out.println("read all footers in " + (t1 - t0) + " ms");
        } finally {
            threadPool.shutdownNow();
        }
    }
    Set<Entry<ColumnDescriptor, ColStats>> entries = stats.entrySet();
    long total = 0;
    long totalUnc = 0;
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        total += colStats.allStats.total;
        totalUnc += colStats.uncStats.total;
    }
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        System.out.println(entry.getKey() + " " + percent(colStats.allStats.total, total) + "% of all space " + colStats);
    }
    System.out.println("number of blocks: " + blockCount);
    System.out.println("total data size: " + humanReadable(total) + " (raw " + humanReadable(totalUnc) + ")");
    System.out.println("total record: " + humanReadable(recordCount));
    System.out.println("average block size: " + humanReadable(total / blockCount) + " (raw " + humanReadable(totalUnc / blockCount) + ")");
    System.out.println("average record count: " + humanReadable(recordCount / blockCount));
}
Also used: ParquetDecodingException (org.apache.parquet.io.ParquetDecodingException), FileStatus (org.apache.hadoop.fs.FileStatus), LinkedBlockingDeque (java.util.concurrent.LinkedBlockingDeque), Configuration (org.apache.hadoop.conf.Configuration), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), URI (java.net.URI), Callable (java.util.concurrent.Callable), Entry (java.util.Map.Entry), FileSystem (org.apache.hadoop.fs.FileSystem), Path (org.apache.hadoop.fs.Path), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ExecutorService (java.util.concurrent.ExecutorService), Future (java.util.concurrent.Future)
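
Stripped of the thread pool and the progress bar, the core of this example is one readFooter call per file. A minimal single-file sketch using the same APIs (the class name and the use of args[0] as the path are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterPeek {
    public static void main(String[] args) throws Exception {
        // Read the footer eagerly, including row-group (block) metadata
        ParquetMetadata footer = ParquetFileReader.readFooter(
                new Configuration(), new Path(args[0]), ParquetMetadataConverter.NO_FILTER);
        long rows = 0;
        for (BlockMetaData block : footer.getBlocks()) {
            rows += block.getRowCount();
        }
        System.out.println(footer.getBlocks().size() + " row groups, " + rows + " rows");
    }
}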

Example 33 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From class TestParquetMetadataConverter, method testColumnOrders.

@Test
public void testColumnOrders() throws IOException {
    MessageType schema = parseMessageType("message test {"
            // Normal column with type defined column order -> typeDefined
            + "  optional binary binary_col;"
            + "  optional group map_col (MAP) {"
            + "    repeated group map (MAP_KEY_VALUE) {"
            // Key to be hacked to have unknown column order -> undefined
            + "        required binary key (UTF8);"
            + "        optional group list_col (LIST) {"
            + "          repeated group list {"
            // INT96 element with type defined column order -> undefined
            + "            optional int96 array_element;"
            + "          }"
            + "        }"
            + "    }"
            + "  }"
            + "}");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
    List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
    assertEquals(3, columnOrders.size());
    for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
        assertTrue(columnOrder.isSetTYPE_ORDER());
    }
    // Simulate that thrift got a union type that is not in the generated code
    // (when the file contains a not-yet-supported column order)
    columnOrders.get(1).clear();
    MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
    List<ColumnDescriptor> columns = resultSchema.getColumns();
    assertEquals(3, columns.size());
    assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
Also used: BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), MessageType (org.apache.parquet.schema.MessageType), MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType), FileMetaData (org.apache.parquet.format.FileMetaData), ColumnOrder (org.apache.parquet.schema.ColumnOrder), Test (org.junit.Test)
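
The defaults this test relies on can also be observed directly on a parsed schema: parquet-mr assigns the type-defined column order to ordinary primitives and leaves INT96 undefined. A small sketch with an illustrative two-column schema:

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnOrderPeek {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message m { required binary s (UTF8); required int96 ts; }");
        for (ColumnDescriptor column : schema.getColumns()) {
            // Expect the type-defined order for the binary column
            // and the undefined order for the INT96 column
            System.out.println(column + " -> " + column.getPrimitiveType().columnOrder());
        }
    }
}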

Example 34 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From class TestParquetMetadataConverter, method testMetadataToJson.

@Test
public void testMetadataToJson() {
    ParquetMetadata metadata = new ParquetMetadata(null, null);
    assertEquals("{\"fileMetaData\":null,\"blocks\":null}", ParquetMetadata.toJSON(metadata));
    assertEquals("{\n" + "  \"fileMetaData\" : null,\n" + "  \"blocks\" : null\n" + "}", ParquetMetadata.toPrettyJSON(metadata));
}
Also used: ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), Test (org.junit.Test)
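
The same serializers accept a footer read from a real file, which makes them useful for ad hoc inspection. A minimal sketch, assuming a Parquet file path is passed as the first argument:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class FooterToJson {
    public static void main(String[] args) throws Exception {
        ParquetMetadata footer = ParquetFileReader.readFooter(
                new Configuration(), new Path(args[0]), ParquetMetadataConverter.NO_FILTER);
        // Human-readable dump of the schema, key/value metadata and row groups
        System.out.println(ParquetMetadata.toPrettyJSON(footer));
    }
}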

Example 35 with ParquetMetadata

Use of org.apache.parquet.hadoop.metadata.ParquetMetadata in project parquet-mr by apache.

From class ParquetRecordReaderWrapper, method getSplit.

/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(final InputSplit oldSplit, final JobConf conf) throws IOException {
    if (oldSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) oldSplit;
        final long splitStart = fileSplit.getStart();
        final long splitLength = fileSplit.getLength();
        final Path finalPath = fileSplit.getPath();
        final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());
        final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
        final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        final ReadContext readContext = new DataWritableReadSupport().init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
        schemaSize = MessageTypeParser.parseMessageType(readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)).getFieldCount();
        return new ParquetInputSplit(finalPath, splitStart, splitStart + splitLength, splitLength, fileSplit.getLocations(), null);
    } else {
        throw new IllegalArgumentException("Unknown split type: " + oldSplit);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata), ReadContext (org.apache.parquet.hadoop.api.ReadSupport.ReadContext), ParquetInputSplit (org.apache.parquet.hadoop.ParquetInputSplit), FileSplit (org.apache.hadoop.mapred.FileSplit), JobConf (org.apache.hadoop.mapred.JobConf), FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData)
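
Note the SKIP_ROW_GROUPS metadata filter: getSplit only needs the schema and the key/value metadata, so it avoids deserializing per-row-group metadata. A standalone sketch of the same trick (class name and args usage are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class SchemaOnlyRead {
    public static void main(String[] args) throws Exception {
        // With SKIP_ROW_GROUPS, getBlocks() comes back empty but the schema
        // and the key/value metadata are still populated
        ParquetMetadata footer = ParquetFileReader.readFooter(
                new Configuration(), new Path(args[0]), ParquetMetadataConverter.SKIP_ROW_GROUPS);
        System.out.println(footer.getFileMetaData().getSchema());
    }
}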

Aggregations

Classes most often used together with ParquetMetadata across these examples, with occurrence counts:

ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 76
Path (org.apache.hadoop.fs.Path): 39
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 27
Configuration (org.apache.hadoop.conf.Configuration): 21
MessageType (org.apache.parquet.schema.MessageType): 21
ArrayList (java.util.ArrayList): 19
IOException (java.io.IOException): 18
Test (org.junit.Test): 17
FileSystem (org.apache.hadoop.fs.FileSystem): 16
Map (java.util.Map): 11
FileMetaData (org.apache.parquet.hadoop.metadata.FileMetaData): 11
File (java.io.File): 10
FileStatus (org.apache.hadoop.fs.FileStatus): 10
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 9
HashMap (java.util.HashMap): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 7
List (java.util.List): 6
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 6
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 6
ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 6