Search in sources :

Example 1 with Writer

Use of org.apache.orc.Writer in project flink by apache.

From the class OrcColumnarRowSplitReaderNoHiveTest, the method prepareReadFileWithTypes:

@Override
protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
    // NOTE: ORC stores field names, so the names here must match the ones the reader expects
    TypeDescription schema = TypeDescription.fromString("struct<" + "f0:float," + "f1:double," + "f2:timestamp," + "f3:tinyint," + "f4:smallint" + ">");
    org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
    Configuration conf = new Configuration();
    Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch(rowSize);
    DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
    DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
    TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
    LongColumnVector col3 = (LongColumnVector) batch.cols[3];
    LongColumnVector col4 = (LongColumnVector) batch.cols[4];
    col0.noNulls = false;
    col1.noNulls = false;
    col2.noNulls = false;
    col3.noNulls = false;
    col4.noNulls = false;
    for (int i = 0; i < rowSize - 1; i++) {
        col0.vector[i] = i;
        col1.vector[i] = i;
        Timestamp timestamp = toTimestamp(i);
        col2.time[i] = timestamp.getTime();
        col2.nanos[i] = timestamp.getNanos();
        col3.vector[i] = i;
        col4.vector[i] = i;
    }
    // leave the last row null in every column
    col0.isNull[rowSize - 1] = true;
    col1.isNull[rowSize - 1] = true;
    col2.isNull[rowSize - 1] = true;
    col3.isNull[rowSize - 1] = true;
    col4.isNull[rowSize - 1] = true;
    batch.size = rowSize;
    writer.addRowBatch(batch);
    batch.reset();
    writer.close();
}
Also used : TimestampColumnVector(org.apache.orc.storage.ql.exec.vector.TimestampColumnVector) DoubleColumnVector(org.apache.orc.storage.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) Timestamp(java.sql.Timestamp) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) TypeDescription(org.apache.orc.TypeDescription) Writer(org.apache.orc.Writer) LongColumnVector(org.apache.orc.storage.ql.exec.vector.LongColumnVector)
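For reference, here is a minimal sketch of how such a test file could be read back with the core ORC reader API. This is an illustration only: it assumes the unshaded org.apache.orc and hive-storage-api classes are on the classpath, and the file path is a placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;

public class OrcReadBackSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder: point this at a file produced by prepareReadFileWithTypes
        Path path = new Path("/tmp/orc-types-test.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        TypeDescription schema = reader.getSchema();
        System.out.println("schema: " + schema + ", rows: " + reader.getNumberOfRows());
        VectorizedRowBatch batch = schema.createRowBatch();
        RecordReader rows = reader.rows();
        while (rows.nextBatch(batch)) {
            DoubleColumnVector f0 = (DoubleColumnVector) batch.cols[0];
            for (int r = 0; r < batch.size; r++) {
                // the last row written above should report null here
                boolean isNull = !f0.noNulls && f0.isNull[r];
                System.out.println("f0[" + r + "] = " + (isNull ? "null" : String.valueOf(f0.vector[r])));
            }
        }
        rows.close();
    }
}

Reading back through Reader.getSchema() and the per-column isNull flags is an easy way to confirm that the final row written by the test really is null in every column.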

Example 2 with Writer

Use of org.apache.orc.Writer in project zeppelin by apache.

From the class SqlInterpreterTest, the method createORCFile:

public File createORCFile(int[] values) throws IOException {
    File file = File.createTempFile("zeppelin-flink-input", ".orc");
    // delete the placeholder so OrcFile.createWriter can create the file itself
    file.delete();
    Path path = new Path(file.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.set("orc.compress", "snappy");
    TypeDescription schema = TypeDescription.fromString("struct<msg:int>");
    Writer writer = OrcFile.createWriter(path, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch();
    LongColumnVector x = (LongColumnVector) batch.cols[0];
    for (int i = 0; i < values.length; ++i) {
        int row = batch.size++;
        x.vector[row] = values[i];
        // If the batch is full, write it out and start over.
        if (batch.size == batch.getMaxSize()) {
            writer.addRowBatch(batch);
            batch.reset();
        }
    }
    if (batch.size != 0) {
        writer.addRowBatch(batch);
        batch.reset();
    }
    writer.close();
    return file;
}
Also used : Path(org.apache.hadoop.fs.Path) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) Configuration(org.apache.hadoop.conf.Configuration) TypeDescription(org.apache.orc.TypeDescription) OrcFile(org.apache.orc.OrcFile) File(java.io.File) PrintWriter(java.io.PrintWriter) FileWriter(java.io.FileWriter) Writer(org.apache.orc.Writer) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
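As a quick sanity check on what this helper writes, a hedged sketch that inspects the resulting file with the ORC reader; the path is a placeholder for the File returned by createORCFile.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;

public class OrcFileInspect {
    public static void main(String[] args) throws Exception {
        // Placeholder: substitute file.getAbsolutePath() from createORCFile(...)
        Path path = new Path("/tmp/zeppelin-flink-input.orc");
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(new Configuration()));
        // "orc.compress" = "snappy" above should show up here as SNAPPY
        System.out.println("compression: " + reader.getCompressionKind());
        System.out.println("rows:        " + reader.getNumberOfRows());
        System.out.println("schema:      " + reader.getSchema());
    }
}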

Example 3 with Writer

Use of org.apache.orc.Writer in project druid by druid-io.

From the class DruidOrcInputFormatTest, the method makeOrcFile:

private File makeOrcFile() throws IOException {
    final File dir = temporaryFolder.newFolder();
    final File testOrc = new File(dir, "test.orc");
    TypeDescription schema = TypeDescription.createStruct().addField("timestamp", TypeDescription.createString()).addField("col1", TypeDescription.createString()).addField("col2", TypeDescription.createList(TypeDescription.createString())).addField("val1", TypeDescription.createFloat());
    Configuration conf = new Configuration();
    Writer writer = OrcFile.createWriter(new Path(testOrc.getPath()), OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000).bufferSize(10000).compress(CompressionKind.ZLIB).version(OrcFile.Version.CURRENT));
    VectorizedRowBatch batch = schema.createRowBatch();
    batch.size = 1;
    ((BytesColumnVector) batch.cols[0]).setRef(0, timestamp.getBytes(), 0, timestamp.length());
    ((BytesColumnVector) batch.cols[1]).setRef(0, col1.getBytes(), 0, col1.length());
    ListColumnVector listColumnVector = (ListColumnVector) batch.cols[2];
    listColumnVector.childCount = col2.length;
    // the single row's list starts at child offset 0 (the default) and holds the col2 entries
    listColumnVector.lengths[0] = 3;
    for (int idx = 0; idx < col2.length; idx++) {
        ((BytesColumnVector) listColumnVector.child).setRef(idx, col2[idx].getBytes(), 0, col2[idx].length());
    }
    ((DoubleColumnVector) batch.cols[3]).vector[0] = val1;
    writer.addRowBatch(batch);
    writer.close();
    return testOrc;
}
Also used : Path(org.apache.hadoop.fs.Path) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) ListColumnVector(org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) BytesColumnVector(org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) TypeDescription(org.apache.orc.TypeDescription) OrcFile(org.apache.orc.OrcFile) File(java.io.File) Writer(org.apache.orc.Writer)
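The Druid test fills only a single row, so offsets keeps its default of 0 and lengths[0] is set directly. As a more general illustration of how a ListColumnVector is populated across several rows, here is a self-contained sketch with made-up data (not code from the Druid project):

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

public class ListColumnSketch {
    public static void main(String[] args) {
        TypeDescription schema = TypeDescription.fromString("struct<tags:array<string>>");
        VectorizedRowBatch batch = schema.createRowBatch();
        ListColumnVector lists = (ListColumnVector) batch.cols[0];
        BytesColumnVector child = (BytesColumnVector) lists.child;

        String[][] rows = {{"a", "b"}, {"c"}, {"d", "e", "f"}};
        int childIndex = 0;
        for (String[] row : rows) {
            int r = batch.size++;
            lists.offsets[r] = childIndex;   // where this row's elements start in the child vector
            lists.lengths[r] = row.length;   // how many elements this row has
            for (String s : row) {
                byte[] bytes = s.getBytes(java.nio.charset.StandardCharsets.UTF_8);
                child.setRef(childIndex++, bytes, 0, bytes.length);
            }
        }
        lists.childCount = childIndex;       // total number of child elements across all rows
        System.out.println(batch.toString());
    }
}

The key invariant is that offsets[r] and lengths[r] describe a contiguous slice of the child vector, and childCount equals the total number of child entries written.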

Example 4 with Writer

Use of org.apache.orc.Writer in project flink by apache.

From the class OrcColumnarRowSplitReaderTest, the method prepareReadFileWithTypes:

protected void prepareReadFileWithTypes(String file, int rowSize) throws IOException {
    // NOTE: ORC stores field names, so the names here must match the ones the reader expects
    TypeDescription schema = TypeDescription.fromString("struct<" + "f0:float," + "f1:double," + "f2:timestamp," + "f3:tinyint," + "f4:smallint" + ">");
    org.apache.hadoop.fs.Path filePath = new org.apache.hadoop.fs.Path(file);
    Configuration conf = new Configuration();
    Writer writer = OrcFile.createWriter(filePath, OrcFile.writerOptions(conf).setSchema(schema));
    VectorizedRowBatch batch = schema.createRowBatch(rowSize);
    DoubleColumnVector col0 = (DoubleColumnVector) batch.cols[0];
    DoubleColumnVector col1 = (DoubleColumnVector) batch.cols[1];
    TimestampColumnVector col2 = (TimestampColumnVector) batch.cols[2];
    LongColumnVector col3 = (LongColumnVector) batch.cols[3];
    LongColumnVector col4 = (LongColumnVector) batch.cols[4];
    col0.noNulls = false;
    col1.noNulls = false;
    col2.noNulls = false;
    col3.noNulls = false;
    col4.noNulls = false;
    for (int i = 0; i < rowSize - 1; i++) {
        col0.vector[i] = i;
        col1.vector[i] = i;
        Timestamp timestamp = toTimestamp(i);
        col2.time[i] = timestamp.getTime();
        col2.nanos[i] = timestamp.getNanos();
        col3.vector[i] = i;
        col4.vector[i] = i;
    }
    // leave the last row null in every column
    col0.isNull[rowSize - 1] = true;
    col1.isNull[rowSize - 1] = true;
    col2.isNull[rowSize - 1] = true;
    col3.isNull[rowSize - 1] = true;
    col4.isNull[rowSize - 1] = true;
    batch.size = rowSize;
    writer.addRowBatch(batch);
    batch.reset();
    writer.close();
}
Also used : Path(org.apache.flink.core.fs.Path) TimestampColumnVector(org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) DoubleColumnVector(org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) Configuration(org.apache.hadoop.conf.Configuration) Timestamp(java.sql.Timestamp) VectorizedRowBatch(org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) TypeDescription(org.apache.orc.TypeDescription) Writer(org.apache.orc.Writer) LongColumnVector(org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)
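The toTimestamp(i) helper used above is defined elsewhere in the test class and is not shown here. As a rough sketch of the pattern it feeds, the following stand-in (a made-up helper, not Flink's) shows how a java.sql.Timestamp is split into the time and nanos arrays of a TimestampColumnVector:

import java.sql.Timestamp;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;

public class TimestampColumnSketch {
    public static void main(String[] args) {
        TimestampColumnVector col = new TimestampColumnVector(4);
        for (int i = 0; i < 4; i++) {
            // Stand-in for the test's toTimestamp(i) helper: i seconds after the epoch
            Timestamp ts = new Timestamp(i * 1000L);
            // TimestampColumnVector stores epoch milliseconds and nanosecond precision separately
            col.time[i] = ts.getTime();
            col.nanos[i] = ts.getNanos();
        }
        for (int i = 0; i < 4; i++) {
            System.out.println(col.asScratchTimestamp(i));
        }
    }
}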

Aggregations

Configuration (org.apache.hadoop.conf.Configuration) 4
TypeDescription (org.apache.orc.TypeDescription) 4
Writer (org.apache.orc.Writer) 4
VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch) 3
File (java.io.File) 2
Timestamp (java.sql.Timestamp) 2
Path (org.apache.hadoop.fs.Path) 2
DoubleColumnVector (org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector) 2
LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector) 2
OrcFile (org.apache.orc.OrcFile) 2
FileWriter (java.io.FileWriter) 1
PrintWriter (java.io.PrintWriter) 1
Path (org.apache.flink.core.fs.Path) 1
BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector) 1
ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector) 1
TimestampColumnVector (org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector) 1
DoubleColumnVector (org.apache.orc.storage.ql.exec.vector.DoubleColumnVector) 1
LongColumnVector (org.apache.orc.storage.ql.exec.vector.LongColumnVector) 1
TimestampColumnVector (org.apache.orc.storage.ql.exec.vector.TimestampColumnVector) 1
VectorizedRowBatch (org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) 1