
Example 1 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in project hive by apache.

From the class VectorizedParquetRecordReader, method initialize:

public void initialize(ParquetInputSplit split, JobConf configuration) throws IOException, InterruptedException {
    jobConf = configuration;
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    if (rowGroupOffsets == null) {
        //TODO check whether rowGroupOffSets can be null
        // then we need to apply the predicate push down filter
        footer = readFooter(configuration, file, range(split.getStart(), split.getEnd()));
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readFooter(configuration, file, NO_FILTER);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    MessageType tableSchema;
    if (indexAccess) {
        List<Integer> indexSequence = new ArrayList<>();
        // Generates a sequence list of indexes
        for (int i = 0; i < columnNamesList.size(); i++) {
            indexSequence.add(i);
        }
        tableSchema = DataWritableReadSupport.getSchemaByIndex(fileSchema, columnNamesList, indexSequence);
    } else {
        tableSchema = DataWritableReadSupport.getSchemaByName(fileSchema, columnNamesList, columnTypesList);
    }
    indexColumnsWanted = ColumnProjectionUtils.getReadColumnIDs(configuration);
    if (!ColumnProjectionUtils.isReadAllColumns(configuration) && !indexColumnsWanted.isEmpty()) {
        requestedSchema = DataWritableReadSupport.getSchemaByIndex(tableSchema, columnNamesList, indexColumnsWanted);
    } else {
        requestedSchema = fileSchema;
    }
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), file, blocks, requestedSchema.getColumns());
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ArrayList(java.util.ArrayList) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
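
The same footer-driven pattern can be exercised outside Hive. The sketch below is a minimal, hedged example assuming a hypothetical file path and a full-file byte range: it reads the footer with a range filter, counts rows across the selected row groups, and opens a ParquetFileReader through the same deprecated constructor used above.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class FooterBasedOpen {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical path, not from the original example
        Path file = new Path("/tmp/example.parquet");
        // read only the footer, filtered to the byte range a split would cover (here: the whole file)
        ParquetMetadata footer = ParquetFileReader.readFooter(
            conf, file, ParquetMetadataConverter.range(0, Long.MAX_VALUE));
        MessageType schema = footer.getFileMetaData().getSchema();
        List<BlockMetaData> blocks = footer.getBlocks();
        long totalRowCount = 0;
        for (BlockMetaData block : blocks) {
            totalRowCount += block.getRowCount();
        }
        System.out.println("row groups: " + blocks.size() + ", rows: " + totalRowCount);
        // same constructor as the Hive reader: footer metadata + selected blocks + projected columns
        try (ParquetFileReader reader = new ParquetFileReader(
                conf, footer.getFileMetaData(), file, blocks, schema.getColumns())) {
            System.out.println("records readable: " + reader.getRecordCount());
        }
    }
}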

Example 2 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.

From the class TestArrayCompatibility, method testListOfSingleElementStructsWithElementField:

@Test
public void testListOfSingleElementStructsWithElementField() throws Exception {
    Path test = writeDirect("message ListOfSingleElementStructsWithElementField {" + "  optional group list_of_structs (LIST) {" + "    repeated group list {" + "      required group element {" + "        required float element;" + "      }" + "    }" + "  }" + "}", new DirectWriter() {

        @Override
        public void write(RecordConsumer rc) {
            rc.startMessage();
            rc.startField("list_of_structs", 0);
            rc.startGroup();
            // start writing array contents
            rc.startField("list", 0);
            // write a non-null element
            // array level
            rc.startGroup();
            rc.startField("element", 0);
            // the inner element field
            rc.startGroup();
            rc.startField("element", 0);
            rc.addFloat(33.0F);
            rc.endField("element", 0);
            rc.endGroup();
            rc.endField("element", 0);
            // array level
            rc.endGroup();
            // write a second non-null element
            // array level
            rc.startGroup();
            rc.startField("element", 0);
            // the inner element field
            rc.startGroup();
            rc.startField("element", 0);
            rc.addFloat(34.0F);
            rc.endField("element", 0);
            rc.endGroup();
            rc.endField("element", 0);
            // array level
            rc.endGroup();
            // finished writing array contents
            rc.endField("list", 0);
            rc.endGroup();
            rc.endField("list_of_structs", 0);
            rc.endMessage();
        }
    });
    Schema structWithElementField = record("element", field("element", primitive(Schema.Type.FLOAT)));
    // old behavior - assume that the repeated type is the element type
    Schema elementRecord = record("list", field("element", structWithElementField));
    Schema oldSchema = record("ListOfSingleElementStructsWithElementField", optionalField("list_of_structs", array(elementRecord)));
    GenericRecord oldRecord = instance(oldSchema, "list_of_structs", Arrays.asList(instance(elementRecord, "element", instance(structWithElementField, "element", 33.0F)), instance(elementRecord, "element", instance(structWithElementField, "element", 34.0F))));
    // check the schema
    ParquetFileReader reader = ParquetFileReader.open(new Configuration(), test);
    MessageType fileSchema = reader.getFileMetaData().getSchema();
    Assert.assertEquals("Converted schema should assume 2-layer structure", oldSchema, new AvroSchemaConverter(OLD_BEHAVIOR_CONF).convert(fileSchema));
    // both should default to the 2-layer structure
    assertReaderContains(oldBehaviorReader(test), oldSchema, oldRecord);
    Schema newSchema = record("ListOfSingleElementStructsWithElementField", optionalField("list_of_structs", array(structWithElementField)));
    GenericRecord newRecord = instance(newSchema, "list_of_structs", Arrays.asList(instance(structWithElementField, "element", 33.0F), instance(structWithElementField, "element", 34.0F)));
    // check the schema
    Assert.assertEquals("Converted schema should assume 3-layer structure", newSchema, new AvroSchemaConverter(NEW_BEHAVIOR_CONF).convert(fileSchema));
    assertReaderContains(newBehaviorReader(test), newSchema, newRecord);
    // check that this works with compatible nested schemas
    Schema structWithDoubleElementField = record("element", field("element", primitive(Schema.Type.DOUBLE)));
    Schema doubleElementRecord = record("list", field("element", structWithDoubleElementField));
    Schema oldDoubleSchema = record("ListOfSingleElementStructsWithElementField", optionalField("list_of_structs", array(doubleElementRecord)));
    GenericRecord oldDoubleRecord = instance(oldDoubleSchema, "list_of_structs", Arrays.asList(instance(doubleElementRecord, "element", instance(structWithDoubleElementField, "element", 33.0)), instance(doubleElementRecord, "element", instance(structWithDoubleElementField, "element", 34.0))));
    assertReaderContains(oldBehaviorReader(test, oldDoubleSchema), oldDoubleSchema, oldDoubleRecord);
    Schema newDoubleSchema = record("ListOfSingleElementStructsWithElementField", optionalField("list_of_structs", array(structWithDoubleElementField)));
    GenericRecord newDoubleRecord = instance(newDoubleSchema, "list_of_structs", Arrays.asList(instance(structWithDoubleElementField, "element", 33.0), instance(structWithDoubleElementField, "element", 34.0)));
    assertReaderContains(newBehaviorReader(test, newDoubleSchema), newDoubleSchema, newDoubleRecord);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) RecordConsumer(org.apache.parquet.io.api.RecordConsumer) GenericRecord(org.apache.avro.generic.GenericRecord) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test) DirectWriterTest(org.apache.parquet.DirectWriterTest)
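
A condensed sketch of the schema-conversion step this test exercises, assuming a hypothetical file path. How lists are converted depends on the Configuration handed to AvroSchemaConverter, which is exactly what the test's OLD_BEHAVIOR_CONF and NEW_BEHAVIOR_CONF toggle.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class PrintAvroSchema {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // hypothetical path, not from the original test
        Path file = new Path("/tmp/list-of-structs.parquet");
        try (ParquetFileReader reader = ParquetFileReader.open(conf, file)) {
            MessageType fileSchema = reader.getFileMetaData().getSchema();
            // convert the Parquet schema to Avro; list handling follows the Configuration
            Schema avroSchema = new AvroSchemaConverter(conf).convert(fileSchema);
            System.out.println(avroSchema.toString(true));
        }
    }
}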

Example 3 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.

From the class ShowDictionaryCommand, method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
    MessageType schema = reader.getFileMetaData().getSchema();
    ColumnDescriptor descriptor = Util.descriptor(column, schema);
    PrimitiveType type = Util.primitive(column, schema);
    Preconditions.checkNotNull(type);
    DictionaryPageReadStore dictionaryReader;
    int rowGroup = 0;
    while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
        DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
        Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
        console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column, page.getCompressedSize());
        for (int i = 0; i <= dict.getMaxId(); i += 1) {
            switch(type.getPrimitiveTypeName()) {
                case BINARY:
                    if (type.getOriginalType() == OriginalType.UTF8) {
                        console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).toStringUsingUTF8(), 70));
                    } else {
                        console.info("{}: {}", String.format("%6d", i), Util.humanReadable(dict.decodeToBinary(i).getBytesUnsafe(), 70));
                    }
                    break;
                case INT32:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToInt(i));
                    break;
                case INT64:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToLong(i));
                    break;
                case FLOAT:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToFloat(i));
                    break;
                case DOUBLE:
                    console.info("{}: {}", String.format("%6d", i), dict.decodeToDouble(i));
                    break;
                default:
                    throw new IllegalArgumentException("Unknown dictionary type: " + type.getPrimitiveTypeName());
            }
        }
        reader.skipNextRowGroup();
        rowGroup += 1;
    }
    console.info("");
    return 0;
}
Also used : Dictionary(org.apache.parquet.column.Dictionary) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DictionaryPageReadStore(org.apache.parquet.column.page.DictionaryPageReadStore) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
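
A trimmed, hedged sketch of the same dictionary-walking loop, assuming a hypothetical file and inspecting only the first column of the schema rather than one resolved by name. It prints the number of dictionary entries per row group instead of decoding values by primitive type.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.Dictionary;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.DictionaryPageReadStore;
import org.apache.parquet.hadoop.ParquetFileReader;

public class DictionarySizes {
    public static void main(String[] args) throws Exception {
        // hypothetical path, not from the original command
        Path file = new Path("/tmp/example.parquet");
        try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
            // a real tool would resolve the column by name, as ShowDictionaryCommand does
            ColumnDescriptor descriptor = reader.getFileMetaData().getSchema().getColumns().get(0);
            DictionaryPageReadStore dictionaryReader;
            int rowGroup = 0;
            while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
                DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
                if (page != null) { // columns without dictionary encoding have no dictionary page
                    Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
                    System.out.printf("row group %d: %d dictionary entries%n", rowGroup, dict.getMaxId() + 1);
                }
                reader.skipNextRowGroup(); // advance without decoding the data pages
                rowGroup += 1;
            }
        }
    }
}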

Example 4 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.

From the class ShowPagesCommand, method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source));
    MessageType schema = reader.getFileMetaData().getSchema();
    Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
    if (this.columns == null || this.columns.isEmpty()) {
        for (ColumnDescriptor descriptor : schema.getColumns()) {
            columns.put(descriptor, primitive(schema, descriptor.getPath()));
        }
    } else {
        for (String column : this.columns) {
            columns.put(descriptor(column, schema), primitive(column, schema));
        }
    }
    CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
    // accumulate formatted lines to print by column
    Map<String, List<String>> formatted = Maps.newLinkedHashMap();
    PageFormatter formatter = new PageFormatter();
    PageReadStore pageStore;
    int rowGroupNum = 0;
    while ((pageStore = reader.readNextRowGroup()) != null) {
        for (ColumnDescriptor descriptor : columns.keySet()) {
            List<String> lines = formatted.get(columnName(descriptor));
            if (lines == null) {
                lines = Lists.newArrayList();
                formatted.put(columnName(descriptor), lines);
            }
            formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
            PageReader pages = pageStore.getPageReader(descriptor);
            DictionaryPage dict = pages.readDictionaryPage();
            if (dict != null) {
                lines.add(formatter.format(dict));
            }
            DataPage page;
            while ((page = pages.readPage()) != null) {
                lines.add(formatter.format(page));
            }
        }
        rowGroupNum += 1;
    }
    // TODO: Show total column size and overall size per value in the column summary line
    for (String columnName : formatted.keySet()) {
        console.info(String.format("\nColumn: %s\n%s", columnName, StringUtils.leftPad("", 80, '-')));
        console.info(formatter.getHeader());
        for (String line : formatted.get(columnName)) {
            console.info(line);
        }
        console.info("");
    }
    return 0;
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingAsString(org.apache.parquet.cli.Util.encodingAsString) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) PageReadStore(org.apache.parquet.column.page.PageReadStore) PrimitiveType(org.apache.parquet.schema.PrimitiveType) List(java.util.List) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)
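
A reduced sketch of the same page-enumeration loop, assuming a hypothetical file path: it walks every row group, reads the dictionary page and all data pages for each column, and prints simple counts instead of formatted lines.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.page.DataPage;
import org.apache.parquet.column.page.DictionaryPage;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.column.page.PageReader;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.schema.MessageType;

public class CountPages {
    public static void main(String[] args) throws Exception {
        // hypothetical path, not from the original command
        Path file = new Path("/tmp/example.parquet");
        try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
            MessageType schema = reader.getFileMetaData().getSchema();
            PageReadStore rowGroup;
            int rowGroupNum = 0;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                for (ColumnDescriptor descriptor : schema.getColumns()) {
                    PageReader pages = rowGroup.getPageReader(descriptor);
                    DictionaryPage dict = pages.readDictionaryPage();
                    int dataPages = 0;
                    DataPage page;
                    while ((page = pages.readPage()) != null) {
                        dataPages += 1;
                    }
                    System.out.printf("row group %d, column %s: %d data pages%s%n",
                        rowGroupNum, descriptor, dataPages, dict != null ? " + dictionary" : "");
                }
                rowGroupNum += 1;
            }
        }
    }
}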

Example 5 with ParquetFileReader

Use of org.apache.parquet.hadoop.ParquetFileReader in project parquet-mr by apache.

From the class SchemaCommand, method getParquetSchema:

private String getParquetSchema(String source) throws IOException {
    Formats.Format format;
    try (SeekableInput in = openSeekable(source)) {
        format = Formats.detectFormat((InputStream) in);
        in.seek(0);
        switch(format) {
            case PARQUET:
                return new ParquetFileReader(getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER).getFileMetaData().getSchema().toString();
            default:
                throw new IllegalArgumentException(String.format("Could not get a Parquet schema for format %s: %s", format, source));
        }
    }
}
Also used : InputStream(java.io.InputStream) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) SeekableInput(org.apache.avro.file.SeekableInput) Formats(org.apache.parquet.cli.util.Formats)
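
A minimal sketch of the same schema lookup, assuming a hypothetical path and skipping the format detection; try-with-resources closes the reader, which the command above leaves to be reclaimed later.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;

public class PrintParquetSchema {
    public static void main(String[] args) throws Exception {
        // hypothetical path, not from the original command
        Path file = new Path("/tmp/example.parquet");
        try (ParquetFileReader reader = new ParquetFileReader(
                new Configuration(), file, ParquetMetadataConverter.NO_FILTER)) {
            System.out.println(reader.getFileMetaData().getSchema());
        }
    }
}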

Aggregations

ParquetFileReader (org.apache.parquet.hadoop.ParquetFileReader): 19 usages
MessageType (org.apache.parquet.schema.MessageType): 13 usages
Configuration (org.apache.hadoop.conf.Configuration): 11 usages
Path (org.apache.hadoop.fs.Path): 11 usages
PageReadStore (org.apache.parquet.column.page.PageReadStore): 10 usages
IOException (java.io.IOException): 9 usages
SimpleGroup (org.apache.parquet.example.data.simple.SimpleGroup): 7 usages
GroupRecordConverter (org.apache.parquet.example.data.simple.convert.GroupRecordConverter): 7 usages
ColumnIOFactory (org.apache.parquet.io.ColumnIOFactory): 7 usages
MessageColumnIO (org.apache.parquet.io.MessageColumnIO): 7 usages
RecordReader (org.apache.parquet.io.RecordReader): 7 usages
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 6 usages
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5 usages
Vector (org.apache.ignite.ml.math.primitives.vector.Vector): 4 usages
DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector): 4 usages
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4 usages
ArrayList (java.util.ArrayList): 3 usages
TreeMap (java.util.TreeMap): 3 usages
NodeData (org.apache.ignite.ml.tree.NodeData): 3 usages
HashSet (java.util.HashSet): 2 usages
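
Taken together, the most frequently co-occurring classes above (ColumnIOFactory, MessageColumnIO, GroupRecordConverter, RecordReader, SimpleGroup) make up the standard low-level read loop built on ParquetFileReader. The sketch below is a hedged, generic example of that pattern, assuming a hypothetical file path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;

public class ReadAllGroups {
    public static void main(String[] args) throws Exception {
        // hypothetical path, not from any of the examples above
        Path file = new Path("/tmp/example.parquet");
        try (ParquetFileReader reader = ParquetFileReader.open(new Configuration(), file)) {
            MessageType schema = reader.getFileMetaData().getSchema();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            PageReadStore rowGroup;
            while ((rowGroup = reader.readNextRowGroup()) != null) {
                long rows = rowGroup.getRowCount();
                // GroupRecordConverter materializes each record as a SimpleGroup
                RecordReader<Group> recordReader =
                    columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema));
                for (long i = 0; i < rows; i++) {
                    Group group = recordReader.read();
                    System.out.println(group);
                }
            }
        }
    }
}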