Example 41 with MessageType

use of org.apache.parquet.schema.MessageType in project drill by apache.

the class TestFileGenerator method generateParquetFile.

public static void generateParquetFile(String filename, ParquetTestProperties props) throws Exception {
    int currentBooleanByte = 0;
    WrapAroundCounter booleanBitCounter = new WrapAroundCounter(7);
    Configuration configuration = new Configuration();
    configuration.set(FileSystem.FS_DEFAULT_NAME_KEY, "file:///");
    //"message m { required int32 integer; required int64 integer64; required boolean b; required float f; required double d;}"
    FileSystem fs = FileSystem.get(configuration);
    Path path = new Path(filename);
    if (fs.exists(path)) {
        fs.delete(path, false);
    }
    String messageSchema = "message m {";
    for (FieldInfo fieldInfo : props.fields.values()) {
        messageSchema += " required " + fieldInfo.parquetType + " " + fieldInfo.name + ";";
    }
    // remove the last semicolon, java really needs a join method for strings...
    // TODO - nvm apparently it requires a semicolon after every field decl, might want to file a bug
    //messageSchema = messageSchema.substring(schemaType, messageSchema.length() - 1);
    messageSchema += "}";
    MessageType schema = MessageTypeParser.parseMessageType(messageSchema);
    CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    HashMap<String, Integer> columnValuesWritten = new HashMap<>();
    int valsWritten;
    for (int k = 0; k < props.numberRowGroups; k++) {
        w.startBlock(props.recordsPerRowGroup);
        currentBooleanByte = 0;
        booleanBitCounter.reset();
        for (FieldInfo fieldInfo : props.fields.values()) {
            if (!columnValuesWritten.containsKey(fieldInfo.name)) {
                columnValuesWritten.put(fieldInfo.name, 0);
                valsWritten = 0;
            } else {
                valsWritten = columnValuesWritten.get(fieldInfo.name);
            }
            String[] path1 = { fieldInfo.name };
            ColumnDescriptor c1 = schema.getColumnDescription(path1);
            w.startColumn(c1, props.recordsPerRowGroup, codec);
            final int valsPerPage = (int) Math.ceil(props.recordsPerRowGroup / (float) fieldInfo.numberOfPages);
            // 1 MB
            final int PAGE_SIZE = 1024 * 1024;
            byte[] bytes;
            RunLengthBitPackingHybridValuesWriter defLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            RunLengthBitPackingHybridValuesWriter repLevels = new RunLengthBitPackingHybridValuesWriter(MAX_EXPECTED_BIT_WIDTH_FOR_DEFINITION_LEVELS, valsPerPage, PAGE_SIZE, new DirectByteBufferAllocator());
            // for variable length binary fields
            int bytesNeededToEncodeLength = 4;
            if (fieldInfo.bitLength > 0) {
                bytes = new byte[(int) Math.ceil(valsPerPage * fieldInfo.bitLength / 8.0)];
            } else {
                // the 3 * bytesNeededToEncodeLength term accounts for the 4-byte length stored with each of the three values
                int totalValLength = ((byte[]) fieldInfo.values[0]).length + ((byte[]) fieldInfo.values[1]).length + ((byte[]) fieldInfo.values[2]).length + 3 * bytesNeededToEncodeLength;
                // accounts for pages whose value count is not divisible by the 3 cycling test values
                int leftOverBytes = 0;
                if (valsPerPage % 3 > 0) {
                    leftOverBytes += ((byte[]) fieldInfo.values[1]).length + bytesNeededToEncodeLength;
                }
                if (valsPerPage % 3 > 1) {
                    leftOverBytes += ((byte[]) fieldInfo.values[2]).length + bytesNeededToEncodeLength;
                }
                bytes = new byte[valsPerPage / 3 * totalValLength + leftOverBytes];
            }
            int bytesPerPage = (int) (valsPerPage * (fieldInfo.bitLength / 8.0));
            int bytesWritten = 0;
            for (int z = 0; z < fieldInfo.numberOfPages; z++, bytesWritten = 0) {
                for (int i = 0; i < valsPerPage; i++) {
                    repLevels.writeInteger(0);
                    defLevels.writeInteger(1);
                    //System.out.print(i + ", " + (i % 25 == 0 ? "\n gen " + fieldInfo.name + ": " : ""));
                    if (fieldInfo.values[0] instanceof Boolean) {
                        bytes[currentBooleanByte] |= bitFields[booleanBitCounter.val] & ((boolean) fieldInfo.values[valsWritten % 3] ? allBitsTrue : allBitsFalse);
                        booleanBitCounter.increment();
                        if (booleanBitCounter.val == 0) {
                            currentBooleanByte++;
                        }
                        valsWritten++;
                        if (currentBooleanByte > bytesPerPage) {
                            break;
                        }
                    } else {
                        if (fieldInfo.values[valsWritten % 3] instanceof byte[]) {
                            System.arraycopy(ByteArrayUtil.toByta(((byte[]) fieldInfo.values[valsWritten % 3]).length), 0, bytes, bytesWritten, bytesNeededToEncodeLength);
                            System.arraycopy(fieldInfo.values[valsWritten % 3], 0, bytes, bytesWritten + bytesNeededToEncodeLength, ((byte[]) fieldInfo.values[valsWritten % 3]).length);
                            bytesWritten += ((byte[]) fieldInfo.values[valsWritten % 3]).length + bytesNeededToEncodeLength;
                        } else {
                            System.arraycopy(ByteArrayUtil.toByta(fieldInfo.values[valsWritten % 3]), 0, bytes, i * (fieldInfo.bitLength / 8), fieldInfo.bitLength / 8);
                        }
                        valsWritten++;
                    }
                }
                byte[] fullPage = new byte[2 * 4 * valsPerPage + bytes.length];
                byte[] repLevelBytes = repLevels.getBytes().toByteArray();
                byte[] defLevelBytes = defLevels.getBytes().toByteArray();
                System.arraycopy(bytes, 0, fullPage, 0, bytes.length);
                System.arraycopy(repLevelBytes, 0, fullPage, bytes.length, repLevelBytes.length);
                System.arraycopy(defLevelBytes, 0, fullPage, bytes.length + repLevelBytes.length, defLevelBytes.length);
                w.writeDataPage((props.recordsPerRowGroup / fieldInfo.numberOfPages), fullPage.length, BytesInput.from(fullPage), RLE, RLE, PLAIN);
                currentBooleanByte = 0;
            }
            w.endColumn();
            // carry the per-column count into the next row group (put overwrites any existing entry)
            columnValuesWritten.put(fieldInfo.name, valsWritten);
        }
        w.endBlock();
    }
    w.end(new HashMap<String, String>());
    logger.debug("Finished generating parquet file {}", path.getName());
}
Also used : Path(org.apache.hadoop.fs.Path) DirectByteBufferAllocator(org.apache.parquet.bytes.DirectByteBufferAllocator) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetFileWriter(org.apache.parquet.hadoop.ParquetFileWriter) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) RunLengthBitPackingHybridValuesWriter(org.apache.parquet.column.values.rle.RunLengthBitPackingHybridValuesWriter) FileSystem(org.apache.hadoop.fs.FileSystem) MessageType(org.apache.parquet.schema.MessageType)
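
The core pattern in this example is the round trip from a schema string to a MessageType and from there to the per-column ColumnDescriptors that drive the writer. A minimal, self-contained sketch of that lookup (the class name and schema string below are illustrative, not taken from the Drill source):

import java.util.Arrays;

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MessageTypeSketch {
    public static void main(String[] args) {
        // Same flat, all-required schema shape the generator assembles field by field.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message m { required int32 integer; required boolean b; }");

        // Each leaf field resolves to one ColumnDescriptor; for top-level required
        // fields the max repetition and definition levels are both 0.
        for (ColumnDescriptor column : schema.getColumns()) {
            System.out.printf("%s maxRep=%d maxDef=%d%n",
                    Arrays.toString(column.getPath()),
                    column.getMaxRepetitionLevel(),
                    column.getMaxDefinitionLevel());
        }
    }
}

The generator's getColumnDescription(path1) call is the single-column variant of the getColumns() loop shown here.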

Example 42 with MessageType

use of org.apache.parquet.schema.MessageType in project hive by apache.

the class DataWritableReadSupport method init.

/**
 * Creates the Parquet-side ReadContext with the requested schema during the init phase.
 *
 * @param context the InitContext carrying the job configuration and the file schema
 * @return the Parquet ReadContext
 */
@Override
public org.apache.parquet.hadoop.api.ReadSupport.ReadContext init(InitContext context) {
    Configuration configuration = context.getConfiguration();
    MessageType fileSchema = context.getFileSchema();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    Map<String, String> contextMetadata = new HashMap<String, String>();
    boolean indexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
    if (columnNames != null) {
        List<String> columnNamesList = getColumnNames(columnNames);
        String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
        List<TypeInfo> columnTypesList = getColumnTypes(columnTypes);
        MessageType tableSchema = getRequestedSchemaForIndexAccess(indexAccess, columnNamesList, columnTypesList, fileSchema);
        contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, tableSchema.toString());
        contextMetadata.put(PARQUET_COLUMN_INDEX_ACCESS, String.valueOf(indexAccess));
        this.hiveTypeInfo = TypeInfoFactory.getStructTypeInfo(columnNamesList, columnTypesList);
        return new ReadContext(getRequestedPrunedSchema(columnNamesList, tableSchema, configuration), contextMetadata);
    } else {
        contextMetadata.put(HIVE_TABLE_AS_PARQUET_SCHEMA, fileSchema.toString());
        return new ReadContext(fileSchema, contextMetadata);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ListTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType)
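
The interesting step above is schema pruning: building a requested schema that contains only the Hive table columns actually present in the file. Hive's getRequestedSchemaForIndexAccess and getRequestedPrunedSchema helpers also handle nested types and index-based access; the hypothetical prune method below sketches only the basic top-level idea using the public MessageType API:

import java.util.ArrayList;
import java.util.List;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

public class SchemaPruningSketch {

    // Hypothetical helper: keep only the requested top-level columns that exist in the file schema.
    static MessageType prune(MessageType fileSchema, List<String> requestedColumns) {
        List<Type> kept = new ArrayList<>();
        for (String name : requestedColumns) {
            if (fileSchema.containsField(name)) {
                kept.add(fileSchema.getType(name));
            }
        }
        return new MessageType(fileSchema.getName(), kept);
    }
}

Handing the pruned MessageType to the ReadContext is what lets Parquet skip I/O for unreferenced columns.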

Example 43 with MessageType

use of org.apache.parquet.schema.MessageType in project hive by apache.

the class TestHiveSchemaConverter method testMapOriginalType.

@Test
public void testMapOriginalType() throws Exception {
    final String hiveColumnTypes = "map<string,string>";
    final String hiveColumnNames = "mapCol";
    final List<String> columnNames = createHiveColumnsFrom(hiveColumnNames);
    final List<TypeInfo> columnTypes = createHiveTypeInfoFrom(hiveColumnTypes);
    final MessageType messageTypeFound = HiveSchemaConverter.convert(columnNames, columnTypes);
    // the converted messageType has a single optional field named mapCol whose original type is MAP
    assertEquals(1, messageTypeFound.getFieldCount());
    org.apache.parquet.schema.Type topLevel = messageTypeFound.getFields().get(0);
    assertEquals("mapCol", topLevel.getName());
    assertEquals(OriginalType.MAP, topLevel.getOriginalType());
    assertEquals(Repetition.OPTIONAL, topLevel.getRepetition());
    assertEquals(1, topLevel.asGroupType().getFieldCount());
    org.apache.parquet.schema.Type secondLevel = topLevel.asGroupType().getFields().get(0);
    // mapCol contains a single repeated field named "map" whose original type is MAP_KEY_VALUE
    assertEquals("map", secondLevel.getName());
    assertEquals(OriginalType.MAP_KEY_VALUE, secondLevel.getOriginalType());
    assertEquals(Repetition.REPEATED, secondLevel.getRepetition());
}
Also used : TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
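
For reference, the two levels the test asserts are easier to see when the expected shape is written out as a schema string and parsed back. A sketch assuming the legacy MAP / MAP_KEY_VALUE annotations the assertions use (the key/value repetitions shown are illustrative, not taken from HiveSchemaConverter):

import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class MapSchemaSketch {
    public static void main(String[] args) {
        // map<string,string> as an optional MAP group wrapping a repeated MAP_KEY_VALUE group.
        MessageType schema = MessageTypeParser.parseMessageType(
            "message hive_schema {"
          + "  optional group mapCol (MAP) {"
          + "    repeated group map (MAP_KEY_VALUE) {"
          + "      required binary key (UTF8);"
          + "      optional binary value (UTF8);"
          + "    }"
          + "  }"
          + "}");

        GroupType mapCol = schema.getType("mapCol").asGroupType();
        System.out.println(mapCol.getOriginalType());                 // MAP
        System.out.println(mapCol.getType("map").getOriginalType());  // MAP_KEY_VALUE
    }
}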

Example 44 with MessageType

use of org.apache.parquet.schema.MessageType in project hive by apache.

the class TestParquetRecordReaderWrapper method testBuilderComplexTypes2.

/**
 * Check that the converted filter predicate is null when unsupported types are included.
 * @throws Exception
 */
@Test
public void testBuilderComplexTypes2() throws Exception {
    SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd().lessThan("x", PredicateLeaf.Type.DATE, Date.valueOf("2005-3-12")).lessThanEquals("y", PredicateLeaf.Type.STRING, new HiveChar("hi", 10).toString()).equals("z", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("1.0")).end().build();
    MessageType schema = MessageTypeParser.parseMessageType("message test {" + " required int32 x; required binary y; required binary z;}");
    assertEquals(null, ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema));
    sarg = SearchArgumentFactory.newBuilder().startNot().startOr().isNull("x", PredicateLeaf.Type.LONG).between("y", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("10"), new HiveDecimalWritable("20.0")).in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L).nullSafeEquals("a", PredicateLeaf.Type.STRING, new HiveVarchar("stinger", 100).toString()).end().end().build();
    schema = MessageTypeParser.parseMessageType("message test {" + " optional int32 x; required binary y; required int32 z;" + " optional binary a;}");
    assertEquals(null, ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema));
}
Also used : HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
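
For contrast, a predicate built only over supported leaf types converts to a non-null FilterPredicate. The sketch below sidesteps the Hive converter entirely and builds an equivalent predicate directly with Parquet's FilterApi against the same int32/binary columns (the column names mirror the test; the specific comparisons are illustrative):

import static org.apache.parquet.filter2.predicate.FilterApi.and;
import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.intColumn;
import static org.apache.parquet.filter2.predicate.FilterApi.lt;
import static org.apache.parquet.filter2.predicate.FilterApi.ltEq;

import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.io.api.Binary;

public class FilterPredicateSketch {
    public static void main(String[] args) {
        // x is a required int32 and y a required binary in the test schema above;
        // both map cleanly onto Parquet filter columns.
        FilterPredicate predicate = and(
                lt(intColumn("x"), 10),
                ltEq(binaryColumn("y"), Binary.fromString("hi")));
        System.out.println(predicate);  // prints something like and(lt(x, 10), lteq(y, Binary{"hi"}))
    }
}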

Example 45 with MessageType

use of org.apache.parquet.schema.MessageType in project hive by apache.

the class TestParquetRecordReaderWrapper method testBuilderComplexTypes.

/**
 * Check that the converted filter predicate is null when unsupported types are included.
 * @throws Exception
 */
@Test
public void testBuilderComplexTypes() throws Exception {
    SearchArgument sarg = SearchArgumentFactory.newBuilder().startAnd().lessThan("x", PredicateLeaf.Type.DATE, Date.valueOf("1970-1-11")).lessThanEquals("y", PredicateLeaf.Type.STRING, new HiveChar("hi", 10).toString()).equals("z", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("1.0")).end().build();
    MessageType schema = MessageTypeParser.parseMessageType("message test {" + " required int32 x; required binary y; required binary z;}");
    assertEquals(null, ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema));
    sarg = SearchArgumentFactory.newBuilder().startNot().startOr().isNull("x", PredicateLeaf.Type.LONG).between("y", PredicateLeaf.Type.DECIMAL, new HiveDecimalWritable("10"), new HiveDecimalWritable("20.0")).in("z", PredicateLeaf.Type.LONG, 1L, 2L, 3L).nullSafeEquals("a", PredicateLeaf.Type.STRING, new HiveVarchar("stinger", 100).toString()).end().end().build();
    schema = MessageTypeParser.parseMessageType("message test {" + " optional int32 x; required binary y; required int32 z;" + " optional binary a;}");
    assertEquals(null, ParquetFilterPredicateConverter.toFilterPredicate(sarg, schema));
}
Also used : HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) SearchArgument(org.apache.hadoop.hive.ql.io.sarg.SearchArgument) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Aggregations

MessageType (org.apache.parquet.schema.MessageType): 46
Test (org.junit.Test): 25
FilterPredicate (org.apache.parquet.filter2.predicate.FilterPredicate): 15
SearchArgument (org.apache.hadoop.hive.ql.io.sarg.SearchArgument): 9
Path (org.apache.hadoop.fs.Path): 8
Type (org.apache.parquet.schema.Type): 7
GroupType (org.apache.parquet.schema.GroupType): 6
Configuration (org.apache.hadoop.conf.Configuration): 5
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 5
OriginalType (org.apache.parquet.schema.OriginalType): 4
ArrayList (java.util.ArrayList): 3
HashMap (java.util.HashMap): 3
SchemaPath (org.apache.drill.common.expression.SchemaPath): 3
HiveChar (org.apache.hadoop.hive.common.type.HiveChar): 3
ParquetMetadata (org.apache.parquet.hadoop.metadata.ParquetMetadata): 3
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 3
DimensionSchema (io.druid.data.input.impl.DimensionSchema): 2
File (java.io.File): 2
HashSet (java.util.HashSet): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2