Search in sources :

Example 21 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in the project parquet-mr by apache.

The class DictionaryFilterTest defines the method prepareFile:

@BeforeClass
public static void prepareFile() throws IOException {
    // Start from a clean slate so reruns do not append to a stale file.
    cleanup();
    // The schema must be registered on the Configuration before the writer
    // is built, since the builder reads it via withConf(conf).
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
    // Small row-group/page/dictionary sizes force many pages and dictionary
    // fallback, which is what the dictionary-filter tests exercise.
    ParquetWriter<Group> writer =
        ExampleParquetWriter.builder(file)
            .withWriterVersion(PARQUET_1_0)
            .withCompressionCodec(GZIP)
            .withRowGroupSize(1024 * 1024)
            .withPageSize(1024)
            .enableDictionaryEncoding()
            .withDictionaryPageSize(2 * 1024)
            .withConf(conf)
            .build();
    writeData(groupFactory, writer);
}
Also used : Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) BeforeClass(org.junit.BeforeClass)

Example 22 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in the project drill by axbaretto.

The class ParquetSimpleTestFileGenerator defines the method main:

/**
 * Generates the four Parquet test files (simple/complex, nullable and
 * non-nullable) consumed by Drill's Parquet reader tests.
 *
 * @param args unused
 * @throws IOException if creating or writing any of the files fails
 */
public static void main(String[] args) throws IOException {
    SimpleGroupFactory sgf = new SimpleGroupFactory(simpleSchema);
    GroupFactory gf = new SimpleGroupFactory(complexSchema);
    SimpleGroupFactory sngf = new SimpleGroupFactory(simpleNullableSchema);
    GroupFactory ngf = new SimpleGroupFactory(complexNullableSchema);
    // try-with-resources guarantees every writer is closed (flushing its
    // footer) even when an earlier write throws; the original sequence of
    // trailing close() calls leaked all four writers on any exception.
    try (ParquetWriter<Group> simpleWriter = initWriter(simpleSchema, "drill/parquet_test_file_simple");
         ParquetWriter<Group> complexWriter = initWriter(complexSchema, "drill/parquet_test_file_complex");
         ParquetWriter<Group> simpleNullableWriter = initWriter(simpleNullableSchema, "drill/parquet_test_file_simple_nullable");
         ParquetWriter<Group> complexNullableWriter = initWriter(complexNullableSchema, "drill/parquet_test_file_complex_nullable")) {
        ParquetSimpleTestFileGenerator.writeSimpleValues(sgf, simpleWriter, false);
        ParquetSimpleTestFileGenerator.writeSimpleValues(sngf, simpleNullableWriter, true);
        ParquetSimpleTestFileGenerator.writeComplexValues(gf, complexWriter, false);
        ParquetSimpleTestFileGenerator.writeComplexValues(ngf, complexNullableWriter, true);
    }
}
Also used : Group(org.apache.parquet.example.data.Group) GroupFactory(org.apache.parquet.example.data.GroupFactory) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)

Example 23 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in the project hive by apache.

The class TestVectorizedMapColumnReader defines the method writeMapData:

/**
 * Writes {@code elementNum} rows of map test data covering one map column per
 * primitive type plus a multi-level map ({@code map_field}), then closes the
 * writer.
 *
 * <p>Row {@code i} is entirely null when {@code isNull(i)} is true; otherwise
 * each of its maps receives {@code i % 4 + 1} entries whose key and value are
 * the same generated datum (dictionary-friendly or not, per
 * {@code isDictionaryEncoding}).
 *
 * @param writer               destination writer; always closed on return,
 *                             even if a write fails
 * @param isDictionaryEncoding whether generated values repeat so that
 *                             dictionary encoding kicks in
 * @param elementNum           number of rows to write
 * @throws IOException if writing or closing fails
 */
protected static void writeMapData(ParquetWriter<Group> writer, boolean isDictionaryEncoding, int elementNum) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int mapMaxSize = 4;
    // Monotonic counter so every map entry across all rows gets a distinct
    // (or dictionary-repeating) generated value.
    int mapElementIndex = 0;
    // try/finally fixes a leak in the original: writer.close() was skipped
    // whenever writer.write(...) threw.
    try {
        for (int i = 0; i < elementNum; i++) {
            boolean isNull = isNull(i);
            Group group = f.newGroup();
            // Map sizes cycle through 1..4 to vary repetition levels.
            int mapSize = i % mapMaxSize + 1;
            if (!isNull) {
                // the map_field is to test multiple level map definition
                Group multipleLevelGroup = group.addGroup("map_field");
                for (int j = 0; j < mapSize; j++) {
                    int intValForMap = getIntValue(isDictionaryEncoding, mapElementIndex);
                    long longValForMap = getLongValue(isDictionaryEncoding, mapElementIndex);
                    double doubleValForMap = getDoubleValue(isDictionaryEncoding, mapElementIndex);
                    float floatValForMap = getFloatValue(isDictionaryEncoding, mapElementIndex);
                    Binary binaryValForMap = getBinaryValue(isDictionaryEncoding, mapElementIndex);
                    // Decimals are stored as the unscaled internal bytes of a
                    // scale-2 HiveDecimal, wrapped in a Binary.
                    HiveDecimal hd = getDecimal(isDictionaryEncoding, mapElementIndex).setScale(2);
                    HiveDecimalWritable hdw = new HiveDecimalWritable(hd);
                    Binary decimalValForMap = Binary.fromConstantByteArray(hdw.getInternalStorage());
                    group.addGroup("map_int32").append("key", intValForMap).append("value", intValForMap);
                    group.addGroup("map_int64").append("key", longValForMap).append("value", longValForMap);
                    group.addGroup("map_double").append("key", doubleValForMap).append("value", doubleValForMap);
                    group.addGroup("map_float").append("key", floatValForMap).append("value", floatValForMap);
                    group.addGroup("map_binary").append("key", binaryValForMap).append("value", binaryValForMap);
                    group.addGroup("map_decimal").append("key", decimalValForMap).append("value", decimalValForMap);
                    multipleLevelGroup.addGroup("map").append("key", binaryValForMap).append("value", binaryValForMap);
                    mapElementIndex++;
                }
            }
            writer.write(group);
        }
    } finally {
        writer.close();
    }
}
Also used : Group(org.apache.parquet.example.data.Group) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) Binary(org.apache.parquet.io.api.Binary)

Example 24 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in the project hive by apache.

The class TestVectorizedMapColumnReader defines the method writeRepeateMapData:

/**
 * Writes {@code elementNum} rows whose {@code map_int32_for_repeat_test}
 * column contains either no entries (when {@code isNull}) or the same four
 * key/value pairs {@code (0,0)..(3,3)} in every row, then closes the writer.
 *
 * <p>The repeated identical maps exercise the reader's repeated-value
 * (dictionary/RLE) code path.
 *
 * @param writer     destination writer; always closed on return, even if a
 *                   write fails
 * @param elementNum number of rows to write
 * @param isNull     whether every row's map should be left empty/null
 * @throws IOException if writing or closing fails
 */
protected static void writeRepeateMapData(ParquetWriter<Group> writer, int elementNum, boolean isNull) throws IOException {
    SimpleGroupFactory f = new SimpleGroupFactory(schema);
    int mapMaxSize = 4;
    // try/finally fixes a leak in the original: writer.close() was skipped
    // whenever writer.write(...) threw.
    try {
        for (int i = 0; i < elementNum; i++) {
            Group group = f.newGroup();
            if (!isNull) {
                for (int j = 0; j < mapMaxSize; j++) {
                    group.addGroup("map_int32_for_repeat_test").append("key", j).append("value", j);
                }
            }
            writer.write(group);
        }
    } finally {
        writer.close();
    }
}
Also used : Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory)

Example 25 with SimpleGroupFactory

Use of org.apache.parquet.example.data.simple.SimpleGroupFactory in the project presto by prestodb.

The class ParquetTester defines the method nonHiveParquetWriter:

/**
 * Writes rows to {@code outputFile} with parquet-mr's example writer (i.e.
 * bypassing Hive's writer), drawing one value per field from
 * {@code valuesByField} until any iterator is exhausted.
 *
 * <p>Only {@code timestamp} and {@code bigint} fields are supported; both are
 * written as Parquet INT64. A null value simply leaves the field unset in the
 * group.
 *
 * @throws RuntimeException on an unsupported field type
 * @throws Exception        if writing fails
 */
private static void nonHiveParquetWriter(JobConf jobConf, File outputFile, org.apache.parquet.hadoop.metadata.CompressionCodecName compressionCodecName, SettableStructObjectInspector objectInspector, Iterator<?>[] valuesByField, org.apache.parquet.schema.MessageType parquetSchema) throws Exception {
    // The writer's GroupWriteSupport reads the schema from the JobConf.
    GroupWriteSupport.setSchema(parquetSchema, jobConf);
    List<StructField> fields = ImmutableList.copyOf(objectInspector.getAllStructFieldRefs());
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(parquetSchema);
    // Generic type added (the original used a raw ParquetWriter) and
    // try-with-resources ensures the writer is closed even if a write throws
    // (the original leaked it on exception).
    try (org.apache.parquet.hadoop.ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path(outputFile.toURI())).withType(parquetSchema).withCompressionCodec(compressionCodecName).withConf(jobConf).withDictionaryEncoding(true).build()) {
        // Stop as soon as any field's value stream runs dry.
        while (stream(valuesByField).allMatch(Iterator::hasNext)) {
            Group group = groupFactory.newGroup();
            for (int field = 0; field < fields.size(); field++) {
                Object value = valuesByField[field].next();
                if (value == null) {
                    // Absent value: leave the (optional) field unset.
                    continue;
                }
                String fieldName = fields.get(field).getFieldName();
                String typeName = fields.get(field).getFieldObjectInspector().getTypeName();
                switch(typeName) {
                    case "timestamp":
                    case "bigint":
                        group.add(fieldName, (long) value);
                        break;
                    default:
                        throw new RuntimeException(String.format("unhandled type for column %s type %s", fieldName, typeName));
                }
            }
            writer.write(group);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) AbstractIterator(com.google.common.collect.AbstractIterator) Iterator(java.util.Iterator)

Aggregations

SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory)37 Group (org.apache.parquet.example.data.Group)33 MessageType (org.apache.parquet.schema.MessageType)20 Path (org.apache.hadoop.fs.Path)16 Configuration (org.apache.hadoop.conf.Configuration)13 Test (org.junit.Test)12 File (java.io.File)10 GroupFactory (org.apache.parquet.example.data.GroupFactory)10 MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType)8 GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport)7 ParquetWriter (org.apache.parquet.hadoop.ParquetWriter)6 PrimitiveType (org.apache.parquet.schema.PrimitiveType)5 MemPageStore (org.apache.parquet.column.page.mem.MemPageStore)4 HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile)4 HadoopOutputFile (org.apache.parquet.hadoop.util.HadoopOutputFile)4 OutputFile (org.apache.parquet.io.OutputFile)4 Binary (org.apache.parquet.io.api.Binary)4 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 Random (java.util.Random)3