Search in sources :

Example 56 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the project parquet-mr by Apache.

From the class SchemaControlEncryptionTest, the method encryptParquetFile:

/**
 * Writes 1000 rows of generated test data to a Parquet file that encryption
 * tests can then read back.
 *
 * @param file path of the Parquet file to create
 * @param conf Hadoop configuration; the example schema is registered on it
 * @return the same file path that was passed in
 * @throws IOException if writing the Parquet file fails
 */
private String encryptParquetFile(String file, Configuration conf) throws IOException {
    // Schema: required Name (binary) and Age (int64), plus an optional
    // WebLinks group holding repeated LinkedIn/Twitter binary fields.
    MessageType schema = new MessageType(
        "schema",
        new PrimitiveType(REQUIRED, BINARY, "Name"),
        new PrimitiveType(REQUIRED, INT64, "Age"),
        new GroupType(OPTIONAL, "WebLinks",
            new PrimitiveType(REPEATED, BINARY, "LinkedIn"),
            new PrimitiveType(REPEATED, BINARY, "Twitter")));
    conf.set(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
    Builder builder = new Builder(new Path(file));
    builder.withConf(conf);
    try (ParquetWriter writer = builder.build()) {
        int row = 0;
        while (row < 1000) {
            SimpleGroup record = new SimpleGroup(schema);
            record.add("Name", (String) testData.get("Name")[row]);
            record.add("Age", (Long) testData.get("Age")[row]);
            Group webLinks = record.addGroup("WebLinks");
            webLinks.add(0, (String) testData.get("LinkedIn")[row]);
            webLinks.add(1, (String) testData.get("Twitter")[row]);
            writer.write(record);
            row++;
        }
    }
    return file;
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) GroupType(org.apache.parquet.schema.GroupType) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) SimpleGroup(org.apache.parquet.example.data.simple.SimpleGroup) PrimitiveType(org.apache.parquet.schema.PrimitiveType) MessageType(org.apache.parquet.schema.MessageType)

Example 57 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the project parquet-mr by Apache.

From the class ShowDictionaryCommand, the method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        ColumnDescriptor descriptor = Util.descriptor(column, schema);
        PrimitiveType type = Util.primitive(column, schema);
        Preconditions.checkNotNull(type);
        DictionaryPageReadStore dictionaryReader;
        int rowGroup = 0;
        while ((dictionaryReader = reader.getNextDictionaryReader()) != null) {
            DictionaryPage page = dictionaryReader.readDictionaryPage(descriptor);
            if (page != null) {
                console.info("\nRow group {} dictionary for \"{}\":", rowGroup, column);
                Dictionary dict = page.getEncoding().initDictionary(descriptor, page);
                printDictionary(dict, type);
            } else {
                console.info("\nRow group {} has no dictionary for \"{}\"", rowGroup, column);
            }
            reader.skipNextRowGroup();
            rowGroup += 1;
        }
    }
    console.info("");
    return 0;
}
Also used : Dictionary(org.apache.parquet.column.Dictionary) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PrimitiveType(org.apache.parquet.schema.PrimitiveType) DictionaryPageReadStore(org.apache.parquet.column.page.DictionaryPageReadStore) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 58 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the project parquet-mr by Apache.

From the class ShowPagesCommand, the method run:

@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
    Preconditions.checkArgument(targets != null && targets.size() >= 1, "A Parquet file is required.");
    Preconditions.checkArgument(targets.size() == 1, "Cannot process multiple Parquet files.");
    String source = targets.get(0);
    try (ParquetFileReader reader = ParquetFileReader.open(getConf(), qualifiedPath(source))) {
        MessageType schema = reader.getFileMetaData().getSchema();
        Map<ColumnDescriptor, PrimitiveType> columns = Maps.newLinkedHashMap();
        if (this.columns == null || this.columns.isEmpty()) {
            for (ColumnDescriptor descriptor : schema.getColumns()) {
                columns.put(descriptor, primitive(schema, descriptor.getPath()));
            }
        } else {
            for (String column : this.columns) {
                columns.put(descriptor(column, schema), primitive(column, schema));
            }
        }
        CompressionCodecName codec = reader.getRowGroups().get(0).getColumns().get(0).getCodec();
        // accumulate formatted lines to print by column
        Map<String, List<String>> formatted = Maps.newLinkedHashMap();
        PageFormatter formatter = new PageFormatter();
        PageReadStore pageStore;
        int rowGroupNum = 0;
        while ((pageStore = reader.readNextRowGroup()) != null) {
            for (ColumnDescriptor descriptor : columns.keySet()) {
                List<String> lines = formatted.get(columnName(descriptor));
                if (lines == null) {
                    lines = Lists.newArrayList();
                    formatted.put(columnName(descriptor), lines);
                }
                formatter.setContext(rowGroupNum, columns.get(descriptor), codec);
                PageReader pages = pageStore.getPageReader(descriptor);
                DictionaryPage dict = pages.readDictionaryPage();
                if (dict != null) {
                    lines.add(formatter.format(dict));
                }
                DataPage page;
                while ((page = pages.readPage()) != null) {
                    lines.add(formatter.format(page));
                }
            }
            rowGroupNum += 1;
        }
        // TODO: Show total column size and overall size per value in the column summary line
        for (String columnName : formatted.keySet()) {
            console.info(String.format("\nColumn: %s\n%s", columnName, new TextStringBuilder(80).appendPadding(80, '-')));
            console.info(formatter.getHeader());
            for (String line : formatted.get(columnName)) {
                console.info(line);
            }
            console.info("");
        }
    }
    return 0;
}
Also used : DataPage(org.apache.parquet.column.page.DataPage) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) PageReader(org.apache.parquet.column.page.PageReader) Util.minMaxAsString(org.apache.parquet.cli.Util.minMaxAsString) Util.encodingAsString(org.apache.parquet.cli.Util.encodingAsString) TextStringBuilder(org.apache.commons.text.TextStringBuilder) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) PageReadStore(org.apache.parquet.column.page.PageReadStore) PrimitiveType(org.apache.parquet.schema.PrimitiveType) List(java.util.List) MessageType(org.apache.parquet.schema.MessageType) DictionaryPage(org.apache.parquet.column.page.DictionaryPage)

Example 59 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the project parquet-mr by Apache.

From the class TestAvroSchemaConverter, the method testTimestampMillisType:

@Test
public void testTimestampMillisType() throws Exception {
    Schema date = LogicalTypes.timestampMillis().addToSchema(Schema.create(LONG));
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null)));
    testRoundTripConversion(expected, "message myrecord {\n" + "  required int64 timestamp (TIMESTAMP(MILLIS,true));\n" + "}\n");
    for (PrimitiveTypeName primitive : new PrimitiveTypeName[] { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY }) {
        final PrimitiveType type;
        if (primitive == FIXED_LEN_BYTE_ARRAY) {
            type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MILLIS);
        } else {
            type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MILLIS);
        }
        assertThrows("Should not allow TIMESTAMP_MILLIS with " + primitive, IllegalArgumentException.class, () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Also used : AvroTestUtil.optionalField(org.apache.parquet.avro.AvroTestUtil.optionalField) Schema(org.apache.avro.Schema) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) Test(org.junit.Test)

Example 60 with PrimitiveType

Use of org.apache.parquet.schema.PrimitiveType in the project parquet-mr by Apache.

From the class TestAvroSchemaConverter, the method testTimestampMicrosType:

@Test
public void testTimestampMicrosType() throws Exception {
    Schema date = LogicalTypes.timestampMicros().addToSchema(Schema.create(LONG));
    Schema expected = Schema.createRecord("myrecord", null, null, false, Arrays.asList(new Schema.Field("timestamp", date, null, null)));
    testRoundTripConversion(expected, "message myrecord {\n" + "  required int64 timestamp (TIMESTAMP(MICROS,true));\n" + "}\n");
    for (PrimitiveTypeName primitive : new PrimitiveTypeName[] { INT32, INT96, FLOAT, DOUBLE, BOOLEAN, BINARY, FIXED_LEN_BYTE_ARRAY }) {
        final PrimitiveType type;
        if (primitive == FIXED_LEN_BYTE_ARRAY) {
            type = new PrimitiveType(REQUIRED, primitive, 12, "test", TIMESTAMP_MICROS);
        } else {
            type = new PrimitiveType(REQUIRED, primitive, "test", TIMESTAMP_MICROS);
        }
        assertThrows("Should not allow TIMESTAMP_MICROS with " + primitive, IllegalArgumentException.class, () -> new AvroSchemaConverter().convert(message(type)));
    }
}
Also used : AvroTestUtil.optionalField(org.apache.parquet.avro.AvroTestUtil.optionalField) Schema(org.apache.avro.Schema) PrimitiveType(org.apache.parquet.schema.PrimitiveType) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) Test(org.junit.Test)

Aggregations

PrimitiveType (org.apache.parquet.schema.PrimitiveType)123 Test (org.junit.Test)66 MessageType (org.apache.parquet.schema.MessageType)41 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)28 BooleanWritable (org.apache.hadoop.io.BooleanWritable)28 BytesWritable (org.apache.hadoop.io.BytesWritable)28 DoubleWritable (org.apache.hadoop.io.DoubleWritable)28 FloatWritable (org.apache.hadoop.io.FloatWritable)28 IntWritable (org.apache.hadoop.io.IntWritable)28 LongWritable (org.apache.hadoop.io.LongWritable)28 Writable (org.apache.hadoop.io.Writable)28 GroupType (org.apache.parquet.schema.GroupType)25 Test (org.testng.annotations.Test)20 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)14 OriginalType (org.apache.parquet.schema.OriginalType)14 Type (org.apache.parquet.schema.Type)12 List (java.util.List)11 ColumnIndex (org.apache.parquet.internal.column.columnindex.ColumnIndex)11 ColumnIndexBuilder (org.apache.parquet.internal.column.columnindex.ColumnIndexBuilder)11 ArrayList (java.util.ArrayList)10