
Example 21 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class OrcUtilsTest method testNestedFieldSequenceSet.

/**
 * This test mostly targets the following case:
 * Schema: struct<a:array<struct<a:int,b:int>>>
 * field a was set to null by one call of "upConvertOrcStruct", but the subsequent call should still have the nested
 * field filled.
 */
@Test
public void testNestedFieldSequenceSet() {
    TypeDescription schema = TypeDescription.fromString("struct<a:array<struct<a:int,b:int>>>");
    OrcStruct struct = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcTestUtils.fillOrcStructWithFixedValue(struct, schema, 1, "test", true);
    OrcStruct structWithEmptyArray = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcTestUtils.fillOrcStructWithFixedValue(structWithEmptyArray, schema, 1, "test", true);
    structWithEmptyArray.setFieldValue("a", null);
    OrcUtils.upConvertOrcStruct(structWithEmptyArray, struct, schema);
    Assert.assertEquals(struct, structWithEmptyArray);
    OrcStruct struct_2 = (OrcStruct) OrcUtils.createValueRecursively(schema);
    OrcTestUtils.fillOrcStructWithFixedValue(struct_2, schema, 2, "test", true);
    OrcUtils.upConvertOrcStruct(struct_2, struct, schema);
    Assert.assertEquals(struct, struct_2);
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) TypeDescription(org.apache.orc.TypeDescription)
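For context, a minimal sketch of what the nested value looks like when assembled by hand with the public org.apache.orc.mapred types (OrcStruct, OrcList, IntWritable, TypeDescription, as in the imports above); this is roughly what OrcUtils.createValueRecursively and OrcTestUtils.fillOrcStructWithFixedValue automate in the test, and the field values here are purely illustrative.

TypeDescription schema = TypeDescription.fromString("struct<a:array<struct<a:int,b:int>>>");
TypeDescription listSchema = schema.getChildren().get(0);
TypeDescription elementSchema = listSchema.getChildren().get(0);
// Inner struct<a:int,b:int> element.
OrcStruct element = new OrcStruct(elementSchema);
element.setFieldValue("a", new IntWritable(1));
element.setFieldValue("b", new IntWritable(1));
// array<struct<...>> holding the single element.
OrcList<OrcStruct> list = new OrcList<>(listSchema);
list.add(element);
// Outer struct with field "a" pointing at the list.
OrcStruct outer = new OrcStruct(schema);
outer.setFieldValue("a", list);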

Example 22 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method deserializeOrcRecords.

public static final List<Writable> deserializeOrcRecords(Path orcFilePath, FileSystem fs) throws IOException {
    org.apache.orc.Reader fileReader = OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(new Configuration()));
    RecordReader recordReader = fileReader.rows();
    TypeDescription schema = fileReader.getSchema();
    VectorizedRowBatch batch = schema.createRowBatch();
    recordReader.nextBatch(batch);
    int rowInBatch = 0;
    // result container
    List<Writable> orcRecords = new ArrayList<>();
    long rowCount = fileReader.getNumberOfRows();
    while (rowCount > 0) {
        // Refill the batch once it has been fully consumed, so files larger than one batch are handled.
        if (rowInBatch >= batch.size) {
            recordReader.nextBatch(batch);
            rowInBatch = 0;
        }
        // Deserialize records using Mapreduce-like API
        if (schema.getCategory() == TypeDescription.Category.STRUCT) {
            OrcStruct result = (OrcStruct) OrcStruct.createValue(fileReader.getSchema());
            List<TypeDescription> children = schema.getChildren();
            int numberOfChildren = children.size();
            for (int i = 0; i < numberOfChildren; ++i) {
                result.setFieldValue(i, nextValue(batch.cols[i], rowInBatch, children.get(i), result.getFieldValue(i)));
            }
            orcRecords.add(result);
        } else {
            throw new UnsupportedOperationException("The serialized records have to be a struct in the outer-most layer.");
        }
        rowCount -= 1;
        rowInBatch += 1;
    }
    return orcRecords;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) RecordReader(org.apache.orc.RecordReader) ArrayList(java.util.ArrayList) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) OrcStruct(org.apache.orc.mapred.OrcStruct) OrcFile(org.apache.orc.OrcFile) TypeDescription(org.apache.orc.TypeDescription)
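A hedged usage sketch of the helper above; the local file system setup and the output path are illustrative assumptions, not part of the original test.

Configuration conf = new Configuration();
FileSystem fs = FileSystem.getLocal(conf);
// Hypothetical path to an ORC file produced earlier in the test.
Path orcFilePath = new Path("/tmp/orc-writer-test/output.orc");
List<Writable> records = deserializeOrcRecords(orcFilePath, fs);
// Each element is an OrcStruct mirroring the file's outer struct schema.
OrcStruct first = (OrcStruct) records.get(0);
System.out.println(first.getFieldValue(0));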

Example 23 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project incubator-gobblin by apache.

the class GenericRecordToOrcValueWriterTest method getUnionFieldFromStruct.

/**
 * Accessing "fields" using reflection to work-around access modifiers.
 */
private OrcUnion getUnionFieldFromStruct(Writable struct) {
    try {
        OrcStruct orcStruct = (OrcStruct) struct;
        Field objectArr = OrcStruct.class.getDeclaredField("fields");
        objectArr.setAccessible(true);
        return (OrcUnion) ((Object[]) objectArr.get(orcStruct))[0];
    } catch (Exception e) {
        throw new RuntimeException("Cannot access with reflection", e);
    }
}
Also used : Field(java.lang.reflect.Field) OrcStruct(org.apache.orc.mapred.OrcStruct) OrcUnion(org.apache.orc.mapred.OrcUnion) IOException(java.io.IOException)
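Once the OrcUnion has been pulled out, its public accessors are enough to inspect the active branch; a brief sketch, where "record" is assumed to be a deserialized OrcStruct such as one returned by deserializeOrcRecords above.

OrcUnion union = getUnionFieldFromStruct(record);
byte tag = union.getTag();        // index of the branch that is currently set
Object value = union.getObject(); // the wrapped value for that branch

For reference, OrcStruct.getFieldValue(0) returns the same element through the public API; the reflection in the method above reaches the backing "fields" array directly.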

Example 24 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcStructConverter method convertField.

/**
 * Convert an ORC struct field, addressed by fieldIndex, as though the struct were a map. Complex types
 * will be transformed into Java lists and maps when possible ({@link OrcStructConverter#convertList} and
 * {@link OrcStructConverter#convertMap}), and primitive types will be extracted into an
 * ingestion-friendly state (e.g. 'int' and 'long'). Finally, if a field is not present, this method
 * will return null.
 *
 * Note: "Union" types are not currently supported and will be returned as null.
 */
@Nullable
Object convertField(OrcStruct struct, int fieldIndex) {
    if (fieldIndex < 0) {
        return null;
    }
    TypeDescription schema = struct.getSchema();
    TypeDescription fieldDescription = schema.getChildren().get(fieldIndex);
    WritableComparable fieldValue = struct.getFieldValue(fieldIndex);
    if (fieldValue == null) {
        return null;
    }
    if (fieldDescription.getCategory().isPrimitive()) {
        return convertPrimitive(fieldDescription, fieldValue, binaryAsString);
    } else {
        /*
          ORC TYPE    WRITABLE TYPE
          array       org.apache.orc.mapred.OrcList
          map         org.apache.orc.mapred.OrcMap
          struct      org.apache.orc.mapred.OrcStruct
          uniontype   org.apache.orc.mapred.OrcUnion
       */
        switch(fieldDescription.getCategory()) {
            case LIST:
                OrcList orcList = (OrcList) fieldValue;
                return convertList(fieldDescription, orcList, binaryAsString);
            case MAP:
                OrcMap map = (OrcMap) fieldValue;
                return convertMap(fieldDescription, map, binaryAsString);
            case STRUCT:
                OrcStruct structMap = (OrcStruct) fieldValue;
                return convertStructToMap(structMap);
            case UNION:
            // sorry union types :(
            default:
                return null;
        }
    }
}
Also used : OrcStruct(org.apache.orc.mapred.OrcStruct) WritableComparable(org.apache.hadoop.io.WritableComparable) OrcList(org.apache.orc.mapred.OrcList) TypeDescription(org.apache.orc.TypeDescription) OrcMap(org.apache.orc.mapred.OrcMap) Nullable(javax.annotation.Nullable)
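A short sketch of how a caller might resolve the fieldIndex argument by name before invoking convertField; the field name and the surrounding call are assumptions for illustration, with "struct" being an OrcStruct read from an ORC file.

TypeDescription schema = struct.getSchema();
// indexOf returns -1 for an unknown name, which convertField maps to null.
int fieldIndex = schema.getFieldNames().indexOf("someField");
Object converted = convertField(struct, fieldIndex);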

Example 25 with OrcStruct

use of org.apache.orc.mapred.OrcStruct in project druid by druid-io.

the class OrcHadoopInputRowParserTest method getFirstRow.

private static OrcStruct getFirstRow(Job job, String orcPath) throws IOException {
    File testFile = new File(orcPath);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), new String[] { "host" });
    InputFormat<NullWritable, OrcStruct> inputFormat = ReflectionUtils.newInstance(OrcInputFormat.class, job.getConfiguration());
    RecordReader<NullWritable, OrcStruct> reader = inputFormat.getRecordReader(split, new JobConf(job.getConfiguration()), null);
    try {
        final NullWritable key = reader.createKey();
        final OrcStruct value = reader.createValue();
        if (reader.next(key, value)) {
            return value;
        } else {
            throw new NoSuchElementException();
        }
    } finally {
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) OrcStruct(org.apache.orc.mapred.OrcStruct) FileSplit(org.apache.hadoop.mapred.FileSplit) File(java.io.File) NullWritable(org.apache.hadoop.io.NullWritable) JobConf(org.apache.hadoop.mapred.JobConf) NoSuchElementException(java.util.NoSuchElementException)
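A hedged usage sketch of getFirstRow; the ORC file path and the field name read from the returned struct are hypothetical.

Job job = Job.getInstance(new Configuration());
OrcStruct firstRow = getFirstRow(job, "example/test_orc_file.orc"); // hypothetical path
// Fields can then be read by name from the mapred OrcStruct.
System.out.println(firstRow.getFieldValue("timestamp"));            // hypothetical field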

Aggregations

OrcStruct (org.apache.orc.mapred.OrcStruct): 39
TypeDescription (org.apache.orc.TypeDescription): 24
Configuration (org.apache.hadoop.conf.Configuration): 18
IntWritable (org.apache.hadoop.io.IntWritable): 17
Test (org.testng.annotations.Test): 15
ArrayList (java.util.ArrayList): 13
Test (org.junit.Test): 9
OrcFile (org.apache.orc.OrcFile): 8
OrcList (org.apache.orc.mapred.OrcList): 8
File (java.io.File): 7
InputRow (org.apache.druid.data.input.InputRow): 7
Path (org.apache.hadoop.fs.Path): 7
Job (org.apache.hadoop.mapreduce.Job): 7
OrcUnion (org.apache.orc.mapred.OrcUnion): 7
ImmutableList (com.google.common.collect.ImmutableList): 6
List (java.util.List): 6
HadoopDruidIndexerConfig (org.apache.druid.indexer.HadoopDruidIndexerConfig): 6
Text (org.apache.hadoop.io.Text): 6
OrcKey (org.apache.orc.mapred.OrcKey): 5
OrcMap (org.apache.orc.mapred.OrcMap): 5