Example 26 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class PrintFooter, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.err.println("usage PrintFooter <path>");
        return;
    }
    Path path = new Path(new URI(args[0]));
    final Configuration configuration = new Configuration();
    final FileSystem fs = path.getFileSystem(configuration);
    FileStatus fileStatus = fs.getFileStatus(path);
    Path summary = new Path(fileStatus.getPath(), PARQUET_METADATA_FILE);
    if (fileStatus.isDir() && fs.exists(summary)) {
        System.out.println("reading summary file");
        FileStatus summaryStatus = fs.getFileStatus(summary);
        List<Footer> readSummaryFile = ParquetFileReader.readSummaryFile(configuration, summaryStatus);
        for (Footer footer : readSummaryFile) {
            add(footer.getParquetMetadata());
        }
    } else {
        List<FileStatus> statuses;
        if (fileStatus.isDir()) {
            System.out.println("listing files in " + fileStatus.getPath());
            statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
        } else {
            statuses = new ArrayList<FileStatus>();
            statuses.add(fileStatus);
        }
        System.out.println("opening " + statuses.size() + " files");
        int i = 0;
        ExecutorService threadPool = Executors.newFixedThreadPool(5);
        try {
            long t0 = System.currentTimeMillis();
            Deque<Future<ParquetMetadata>> footers = new LinkedBlockingDeque<Future<ParquetMetadata>>();
            for (final FileStatus currentFile : statuses) {
                footers.add(threadPool.submit(new Callable<ParquetMetadata>() {

                    @Override
                    public ParquetMetadata call() throws Exception {
                        try {
                            ParquetMetadata footer = ParquetFileReader.readFooter(configuration, currentFile, NO_FILTER);
                            return footer;
                        } catch (Exception e) {
                            throw new ParquetDecodingException("could not read footer", e);
                        }
                    }
                }));
            }
            int previousPercent = 0;
            int n = 60;
            System.out.print("0% [");
            for (int j = 0; j < n; j++) {
                System.out.print(" ");
            }
            System.out.print("] 100%");
            for (int j = 0; j < n + 6; j++) {
                System.out.print('\b');
            }
            while (!footers.isEmpty()) {
                Future<ParquetMetadata> futureFooter = footers.removeFirst();
                if (!futureFooter.isDone()) {
                    footers.addLast(futureFooter);
                    continue;
                }
                ParquetMetadata footer = futureFooter.get();
                int currentPercent = (++i * n / statuses.size());
                while (currentPercent > previousPercent) {
                    System.out.print("*");
                    previousPercent++;
                }
                add(footer);
            }
            System.out.println("");
            long t1 = System.currentTimeMillis();
            System.out.println("read all footers in " + (t1 - t0) + " ms");
        } finally {
            threadPool.shutdownNow();
        }
    }
    Set<Entry<ColumnDescriptor, ColStats>> entries = stats.entrySet();
    long total = 0;
    long totalUnc = 0;
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        total += colStats.allStats.total;
        totalUnc += colStats.uncStats.total;
    }
    for (Entry<ColumnDescriptor, ColStats> entry : entries) {
        ColStats colStats = entry.getValue();
        System.out.println(entry.getKey() + " " + percent(colStats.allStats.total, total) + "% of all space " + colStats);
    }
    System.out.println("number of blocks: " + blockCount);
    System.out.println("total data size: " + humanReadable(total) + " (raw " + humanReadable(totalUnc) + ")");
    System.out.println("total record: " + humanReadable(recordCount));
    System.out.println("average block size: " + humanReadable(total / blockCount) + " (raw " + humanReadable(totalUnc / blockCount) + ")");
    System.out.println("average record count: " + humanReadable(recordCount / blockCount));
}
Also used : ParquetDecodingException(org.apache.parquet.io.ParquetDecodingException) FileStatus(org.apache.hadoop.fs.FileStatus) LinkedBlockingDeque(java.util.concurrent.LinkedBlockingDeque) Configuration(org.apache.hadoop.conf.Configuration) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) URI(java.net.URI) Callable(java.util.concurrent.Callable) Entry(java.util.Map.Entry) FileSystem(org.apache.hadoop.fs.FileSystem) Path(org.apache.hadoop.fs.Path) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)
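
As a quick companion to PrintFooter, here is a minimal sketch that reads a single footer and prints each ColumnDescriptor's path and levels. The class name ListColumns and the argument handling are illustrative only; it uses the same deprecated readFooter overload and NO_FILTER constant as the example above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import static org.apache.parquet.format.converter.ParquetMetadataConverter.NO_FILTER;

public class ListColumns {
    public static void main(String[] args) throws Exception {
        // Illustrative only: expects a single Parquet file path as the argument
        Path path = new Path(args[0]);
        ParquetMetadata footer = ParquetFileReader.readFooter(new Configuration(), path, NO_FILTER);
        for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
            // getPath() is the column path through the schema; the levels come from the nesting
            System.out.println(String.join(".", col.getPath())
                + " maxDef=" + col.getMaxDefinitionLevel()
                + " maxRep=" + col.getMaxRepetitionLevel());
        }
    }
}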

Example 27 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project parquet-mr by apache.

The class TestParquetMetadataConverter, method testColumnOrders.

@Test
public void testColumnOrders() throws IOException {
    MessageType schema = parseMessageType("message test {" +
            // Normal column with type defined column order -> typeDefined
            "  optional binary binary_col;" +
            "  optional group map_col (MAP) {" +
            "    repeated group map (MAP_KEY_VALUE) {" +
            // Key to be hacked to have unknown column order -> undefined
            "        required binary key (UTF8);" +
            "        optional group list_col (LIST) {" +
            "          repeated group list {" +
            // INT96 element with type defined column order -> undefined
            "            optional int96 array_element;" +
            "          }" +
            "        }" +
            "    }" +
            "  }" +
            "}");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    ParquetMetadata metadata = new ParquetMetadata(fileMetaData, new ArrayList<BlockMetaData>());
    ParquetMetadataConverter converter = new ParquetMetadataConverter();
    FileMetaData formatMetadata = converter.toParquetMetadata(1, metadata);
    List<org.apache.parquet.format.ColumnOrder> columnOrders = formatMetadata.getColumn_orders();
    assertEquals(3, columnOrders.size());
    for (org.apache.parquet.format.ColumnOrder columnOrder : columnOrders) {
        assertTrue(columnOrder.isSetTYPE_ORDER());
    }
    // Simulate that thrift got a union type that is not in the generated code
    // (when the file contains a not-yet-supported column order)
    columnOrders.get(1).clear();
    MessageType resultSchema = converter.fromParquetMetadata(formatMetadata).getFileMetaData().getSchema();
    List<ColumnDescriptor> columns = resultSchema.getColumns();
    assertEquals(3, columns.size());
    assertEquals(ColumnOrder.typeDefined(), columns.get(0).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(1).getPrimitiveType().columnOrder());
    assertEquals(ColumnOrder.undefined(), columns.get(2).getPrimitiveType().columnOrder());
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) FileMetaData(org.apache.parquet.format.FileMetaData) ColumnOrder(org.apache.parquet.schema.ColumnOrder) Test(org.junit.Test)
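
For context, here is a hedged sketch of where the resulting column order surfaces on the Java side: after parsing a schema, each ColumnDescriptor's PrimitiveType carries a ColumnOrder, and INT96 columns come back as undefined just as the test asserts. The tiny schema and the class name ColumnOrderProbe are made up for illustration.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.ColumnOrder;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class ColumnOrderProbe {
    public static void main(String[] args) {
        MessageType schema = MessageTypeParser.parseMessageType(
            "message test { optional binary s (UTF8); optional int96 ts; }");
        for (ColumnDescriptor col : schema.getColumns()) {
            // INT96 gets an undefined order; other primitives default to the type defined order
            ColumnOrder order = col.getPrimitiveType().columnOrder();
            System.out.println(String.join(".", col.getPath()) + " -> " + order);
        }
    }
}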

Example 28 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

The class ParquetReaderUtility, method containsComplexColumn.

/**
 * Check whether any of columns in the given list is either nested or repetitive.
 *
 * @param footer  Parquet file schema
 * @param columns list of query SchemaPath objects
 */
public static boolean containsComplexColumn(ParquetMetadata footer, List<SchemaPath> columns) {
    MessageType schema = footer.getFileMetaData().getSchema();
    if (Utilities.isStarQuery(columns)) {
        for (Type type : schema.getFields()) {
            if (!type.isPrimitive()) {
                return true;
            }
        }
        for (ColumnDescriptor col : schema.getColumns()) {
            if (col.getMaxRepetitionLevel() > 0) {
                return true;
            }
        }
        return false;
    } else {
        Map<String, ColumnDescriptor> colDescMap = ParquetReaderUtility.getColNameToColumnDescriptorMapping(footer);
        Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
        for (SchemaPath schemaPath : columns) {
            // Schema path which is non-leaf is complex column
            if (!schemaPath.isLeaf()) {
                logger.trace("rowGroupScan contains complex column: {}", schemaPath.getUnIndexed().toString());
                return true;
            }
            // following column descriptor lookup failure may mean two cases, depending on subsequent SchemaElement lookup:
            // 1. success: queried column is complex, i.e. GroupType
            // 2. failure: queried column is not in schema and thus is non-complex
            ColumnDescriptor column = colDescMap.get(schemaPath.getUnIndexed().toString().toLowerCase());
            if (column == null) {
                SchemaElement schemaElement = schemaElements.get(schemaPath.getUnIndexed().toString().toLowerCase());
                if (schemaElement != null) {
                    return true;
                }
            } else {
                if (column.getMaxRepetitionLevel() > 0) {
                    logger.trace("rowGroupScan contains repetitive column: {}", schemaPath.getUnIndexed().toString());
                    return true;
                }
            }
        }
    }
    return false;
}
Also used : ConvertedType(org.apache.parquet.format.ConvertedType) GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) SchemaElement(org.apache.parquet.format.SchemaElement)
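
The star-query branch above boils down to a schema-only check that needs nothing from Drill. A minimal sketch of that check follows; the helper name ComplexSchemaCheck is hypothetical.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

public final class ComplexSchemaCheck {
    // A schema is "complex" if any top-level field is a group or any column is repeated
    public static boolean isComplex(MessageType schema) {
        for (Type field : schema.getFields()) {
            if (!field.isPrimitive()) {
                // nested group at the top level
                return true;
            }
        }
        for (ColumnDescriptor col : schema.getColumns()) {
            if (col.getMaxRepetitionLevel() > 0) {
                // repeated field somewhere on the column path
                return true;
            }
        }
        return false;
    }
}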

Example 29 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

The class ReadState, method buildReader.

/**
 * Create the readers needed to read columns: fixed-length or variable length.
 *
 * @param reader parquet record reader
 * @param output output mutator
 */
@SuppressWarnings("unchecked")
public void buildReader(ParquetRecordReader reader, OutputMutator output) throws Exception {
    if (totalNumRecordsToRead == 0) {
        // there is no need to spend resources to init readers, when schema will be output
        for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
            columnMetadata.buildVector(output);
        }
    } else {
        List<VarLengthColumn<? extends ValueVector>> varLengthColumns = new ArrayList<>();
        // initialize all of the column read status objects
        BlockMetaData rowGroupMetadata = schema.getRowGroupMetadata();
        if (rowGroupMetadata != null) {
            Map<String, Integer> columnChunkMetadataPositionsInList = schema.buildChunkMap(rowGroupMetadata);
            for (ParquetColumnMetadata columnMetadata : schema.getColumnMetadata()) {
                ColumnDescriptor column = columnMetadata.column;
                columnMetadata.columnChunkMetaData = rowGroupMetadata.getColumns().get(columnChunkMetadataPositionsInList.get(Arrays.toString(column.getPath())));
                columnMetadata.buildVector(output);
                if (!columnMetadata.isFixedLength()) {
                    // create a reader and add it to the appropriate list
                    varLengthColumns.add(columnMetadata.makeVariableWidthReader(reader));
                } else if (columnMetadata.isRepeated()) {
                    varLengthColumns.add(columnMetadata.makeRepeatedFixedWidthReader(reader));
                } else {
                    fixedLenColumnReaders.add(columnMetadata.makeFixedWidthReader(reader));
                }
            }
            varLengthReader = new VarLenBinaryReader(reader, varLengthColumns);
        }
    }
    if (!schema.isStarQuery()) {
        schema.createNonExistentColumns(output, nullFilledVectors);
    }
}
Also used : ValueVector(org.apache.drill.exec.vector.ValueVector) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList)
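
The lookup above keys column chunk metadata by Arrays.toString of the ColumnDescriptor path. Below is a hedged sketch of how such a chunk map could be built; Drill's actual buildChunkMap may be implemented differently, this only illustrates the keying idea, and ChunkMapSketch is a hypothetical name.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;

final class ChunkMapSketch {
    static Map<String, Integer> buildChunkMap(BlockMetaData rowGroup) {
        Map<String, Integer> positions = new HashMap<>();
        int index = 0;
        for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
            // ColumnPath.toArray() yields the same String[] that ColumnDescriptor.getPath() returns
            positions.put(Arrays.toString(chunk.getPath().toArray()), index++);
        }
        return positions;
    }
}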

Example 30 with ColumnDescriptor

Use of org.apache.parquet.column.ColumnDescriptor in project drill by apache.

The class FileMetadataCollector, method init.

private void init() throws IOException {
    long totalRowCount = 0;
    List<Metadata_V4.RowGroupMetadata_v4> rowGroupMetadataList = new ArrayList<>();
    for (BlockMetaData rowGroup : metadata.getBlocks()) {
        List<Metadata_V4.ColumnMetadata_v4> columnMetadataList = new ArrayList<>();
        long length = 0;
        totalRowCount = totalRowCount + rowGroup.getRowCount();
        for (ColumnChunkMetaData col : rowGroup.getColumns()) {
            String[] columnName = col.getPath().toArray();
            Statistics<?> stats = col.getStatistics();
            PrimitiveType.PrimitiveTypeName primitiveTypeName = col.getPrimitiveType().getPrimitiveTypeName();
            addColumnMetadata(columnName, stats, primitiveTypeName, columnMetadataList);
            length += col.getTotalSize();
        }
        // Note we still read the schema even if there are no values in the RowGroup
        if (rowGroup.getRowCount() == 0) {
            continue;
        }
        Metadata_V4.RowGroupMetadata_v4 rowGroupMeta = new Metadata_V4.RowGroupMetadata_v4(rowGroup.getStartingPos(), length, rowGroup.getRowCount(), getHostAffinity(rowGroup.getStartingPos(), length), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    // add fake row group based on file schema in case when file is empty or all row groups are empty
    if (rowGroupMetadataList.isEmpty()) {
        List<Metadata_V4.ColumnMetadata_v4> columnMetadataList = new ArrayList<>();
        for (ColumnDescriptor columnDescriptor : schema.getColumns()) {
            Statistics<?> stats = Statistics.getBuilderForReading(columnDescriptor.getPrimitiveType()).withMax(null).withMin(null).withNumNulls(0).build();
            addColumnMetadata(columnDescriptor.getPath(), stats, columnDescriptor.getPrimitiveType().getPrimitiveTypeName(), columnMetadataList);
        }
        Metadata_V4.RowGroupMetadata_v4 rowGroupMeta = new Metadata_V4.RowGroupMetadata_v4(0L, 0L, 0L, getHostAffinity(0, 0L), columnMetadataList);
        rowGroupMetadataList.add(rowGroupMeta);
    }
    Path path = Path.getPathWithoutSchemeAndAuthority(file.getPath());
    Metadata_V4.ParquetFileMetadata_v4 parquetFileMetadata_v4 = new Metadata_V4.ParquetFileMetadata_v4(path, file.getLen(), rowGroupMetadataList);
    this.fileMetadata = new Metadata_V4.ParquetFileAndRowCountMetadata(parquetFileMetadata_v4, totalNullCountMap, totalRowCount);
}
Also used : Path(org.apache.hadoop.fs.Path) SchemaPath(org.apache.drill.common.expression.SchemaPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ArrayList(java.util.ArrayList) PrimitiveType(org.apache.parquet.schema.PrimitiveType)
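
The placeholder statistics built for the fake row group can be distilled into a small helper. This is only a sketch of the same builder calls used above; EmptyStatsSketch is a hypothetical name.

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.statistics.Statistics;

final class EmptyStatsSketch {
    // "All null, no values" statistics for one column, as used for empty row groups above
    static Statistics<?> emptyStats(ColumnDescriptor column) {
        return Statistics.getBuilderForReading(column.getPrimitiveType())
            .withMax(null)
            .withMin(null)
            .withNumNulls(0)
            .build();
    }
}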

Aggregations

ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 88 usages
MessageType (org.apache.parquet.schema.MessageType): 33 usages
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 18 usages
Test (org.testng.annotations.Test): 18 usages
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor): 16 usages
ArrayList (java.util.ArrayList): 16 usages
GroupType (org.apache.parquet.schema.GroupType): 14 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 12 usages
Test (org.junit.Test): 12 usages
Domain (com.facebook.presto.common.predicate.Domain): 11 usages
TupleDomain (com.facebook.presto.common.predicate.TupleDomain): 11 usages
Path (org.apache.hadoop.fs.Path): 11 usages
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 11 usages
List (java.util.List): 10 usages
ImmutableList (com.google.common.collect.ImmutableList): 9 usages
HashMap (java.util.HashMap): 9 usages
Configuration (org.apache.hadoop.conf.Configuration): 9 usages
Type (org.apache.parquet.schema.Type): 9 usages
HiveColumnHandle (com.facebook.presto.hive.HiveColumnHandle): 8 usages
IOException (java.io.IOException): 7 usages