Search in sources:

Example 1 with Group

use of org.apache.parquet.example.data.Group in project hive by apache.

From the class VectorizedColumnReaderTestBase, the method writeData:

/**
 * Writes {@code nElements} synthetic rows to the given writer, populating every
 * column of the shared test {@code schema}. Optional columns are omitted on
 * deterministic subsets of rows (driven by {@code isNull(row)} and small moduli)
 * so readers can be exercised against null-bearing data.
 *
 * @param writer               destination writer; closed by this method when done
 * @param isDictionaryEncoding forwarded to the per-column value generators
 * @throws IOException if the underlying writer fails
 */
protected static void writeData(ParquetWriter<Group> writer, boolean isDictionaryEncoding) throws IOException {
    SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
    for (int row = 0; row < nElements; row++) {
        boolean skipOptional = isNull(row);
        int i32 = getIntValue(isDictionaryEncoding, row);
        long i64 = getLongValue(isDictionaryEncoding, row);
        Binary ts96 = getTimestamp(isDictionaryEncoding, row);
        HiveDecimal dec = getDecimal(isDictionaryEncoding, row).setScale(2);
        double d64 = getDoubleValue(isDictionaryEncoding, row);
        float f32 = getFloatValue(isDictionaryEncoding, row);
        boolean flag = getBooleanValue(row);
        Binary bytes = getBinaryValue(isDictionaryEncoding, row);
        // Required primitive columns first; field names must match the test schema.
        Group record = groupFactory.newGroup()
                .append("int32_field", i32)
                .append("int64_field", i64)
                .append("int96_field", ts96)
                .append("double_field", d64)
                .append("float_field", f32)
                .append("boolean_field", flag)
                .append("flba_field", "abc");
        if (!skipOptional) {
            record.append("some_null_field", "x");
        }
        record.append("binary_field", bytes);
        if (!skipOptional) {
            record.append("binary_field_some_null", bytes);
        }
        // The decimal column stores the raw bytes of the unscaled decimal value.
        HiveDecimalWritable decWritable = new HiveDecimalWritable(dec);
        record.append("value", Binary.fromConstantByteArray(decWritable.getInternalStorage()));
        record.addGroup("struct_field").append("a", i32).append("b", d64);
        Group nested = record.addGroup("nested_struct_field");
        nested.addGroup("nsf").append("c", i32).append("d", i32);
        nested.append("e", d64);
        // Struct whose members are individually nullable on different row cadences.
        Group partialStruct = record.addGroup("struct_field_some_null");
        if (row % 2 != 0) {
            partialStruct.append("f", i32);
        }
        if (row % 3 != 0) {
            partialStruct.append("g", d64);
        }
        Group mapField = record.addGroup("map_field");
        if (row % 13 == 1) {
            // Rows at offset 1 mod 13 write a map entry with a missing value.
            mapField.addGroup("map").append("key", bytes);
        } else {
            mapField.addGroup("map").append("key", bytes).append("value", "abc");
        }
        // List column with a row-dependent length of 0..3 elements.
        Group listField = record.addGroup("array_list");
        for (int element = 0; element < row % 4; element++) {
            listField.addGroup("bag").append("array_element", i32);
        }
        writer.write(record);
    }
    writer.close();
}
Also used : Group(org.apache.parquet.example.data.Group) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) Binary(org.apache.parquet.io.api.Binary)

Example 2 with Group

use of org.apache.parquet.example.data.Group in project h2o-3 by h2oai.

From the class ParquetFileGenerator, the method generateParquetFile:

/**
 * Generates a Parquet file with {@code nrows} rows of deterministic values over a
 * five-column schema (int32, int64, float, double, and a TIMESTAMP_MILLIS int64).
 *
 * @param parentDir directory in which the file is created
 * @param filename  name of the generated file
 * @param nrows     number of rows to write
 * @param date      base timestamp; row i gets {@code date.getTime() + i * 117}
 * @return the generated file
 * @throws IOException if writing fails
 */
static File generateParquetFile(File parentDir, String filename, int nrows, Date date) throws IOException {
    File f = new File(parentDir, filename);
    Configuration conf = new Configuration();
    MessageType schema = parseMessageType("message test { " + "required int32 int32_field; " + "required int64 int64_field; " + "required float float_field; " + "required double double_field; " + "required int64 timestamp_field (TIMESTAMP_MILLIS);" + "} ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory fact = new SimpleGroupFactory(schema);
    // try-with-resources guarantees the writer (and its file handle) is closed
    // even if a write fails, replacing the manual try/finally.
    try (ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf)) {
        for (int i = 0; i < nrows; i++) {
            writer.write(fact.newGroup().append("int32_field", 32 + i).append("int64_field", 64L + i).append("float_field", 1.0f + i).append("double_field", 2.0d + i).append("timestamp_field", date.getTime() + (i * 117)));
        }
    }
    return f;
}
Also used : Path(org.apache.hadoop.fs.Path) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) File(java.io.File) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType)

Example 3 with Group

use of org.apache.parquet.example.data.Group in project h2o-3 by h2oai.

From the class ParquetFileGenerator, the method generateSparseParquetFile:

/**
 * Generates a sparse Parquet file: the optional columns ({@code int32_field},
 * {@code string_field}, {@code int32_field2}) are populated only on every 10th
 * row, while the required {@code row} column is always written.
 *
 * NOTE(review): all three optional columns share the same {@code i % 10 == 0}
 * cadence; for a "sparse" test fixture differing moduli may have been intended —
 * confirm against the consuming test before changing.
 *
 * @param parentDir directory in which the file is created
 * @param filename  name of the generated file
 * @param nrows     number of rows to write
 * @return the generated file
 * @throws IOException if writing fails
 */
static File generateSparseParquetFile(File parentDir, String filename, int nrows) throws IOException {
    File f = new File(parentDir, filename);
    Configuration conf = new Configuration();
    MessageType schema = parseMessageType("message test { optional int32 int32_field; optional binary string_field (UTF8); required int32 row; optional int32 int32_field2; } ");
    GroupWriteSupport.setSchema(schema, conf);
    SimpleGroupFactory fact = new SimpleGroupFactory(schema);
    // try-with-resources guarantees the writer is closed even if a write fails,
    // replacing the manual try/finally.
    try (ParquetWriter<Group> writer = new ParquetWriter<Group>(new Path(f.getPath()), new GroupWriteSupport(), UNCOMPRESSED, 1024, 1024, 512, true, false, ParquetProperties.WriterVersion.PARQUET_2_0, conf)) {
        for (int i = 0; i < nrows; i++) {
            Group g = fact.newGroup();
            // The three identical conditions from the original are merged; the
            // append order (int32_field, string_field, int32_field2, row) is preserved.
            if (i % 10 == 0) {
                g = g.append("int32_field", i);
                g = g.append("string_field", "CAT_" + (i % 10));
                g = g.append("int32_field2", i);
            }
            writer.write(g.append("row", i));
        }
    }
    return f;
}
Also used : Path(org.apache.hadoop.fs.Path) GroupWriteSupport(org.apache.parquet.hadoop.example.GroupWriteSupport) Group(org.apache.parquet.example.data.Group) Configuration(org.apache.hadoop.conf.Configuration) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) AvroParquetWriter(org.apache.parquet.avro.AvroParquetWriter) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) File(java.io.File) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) MessageType(org.apache.parquet.schema.MessageType)

Aggregations

Group (org.apache.parquet.example.data.Group)3 SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory)3 File (java.io.File)2 Configuration (org.apache.hadoop.conf.Configuration)2 Path (org.apache.hadoop.fs.Path)2 AvroParquetWriter (org.apache.parquet.avro.AvroParquetWriter)2 ParquetWriter (org.apache.parquet.hadoop.ParquetWriter)2 GroupWriteSupport (org.apache.parquet.hadoop.example.GroupWriteSupport)2 MessageType (org.apache.parquet.schema.MessageType)2 MessageTypeParser.parseMessageType (org.apache.parquet.schema.MessageTypeParser.parseMessageType)2 HiveDecimal (org.apache.hadoop.hive.common.type.HiveDecimal)1 HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)1 Binary (org.apache.parquet.io.api.Binary)1