
Example 6 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class RCFileGenerator method genData.

private static void genData(String format, int numRows, String output, String plainOutput) throws Exception {
    int numFields = 0;
    if (format.equals("student")) {
        rand = new Random(numRows);
        numFields = 3;
    } else if (format.equals("voter")) {
        rand = new Random(1000000000 + numRows);
        numFields = 4;
    } else if (format.equals("alltypes")) {
        rand = new Random(2000000000L + numRows);
        numFields = 10;
    }
    RCFileOutputFormat.setColumnNumber(conf, numFields);
    RCFile.Writer writer = new RCFile.Writer(fs, conf, getFile(output), null, new DefaultCodec());
    PrintWriter pw = new PrintWriter(new FileWriter(plainOutput));
    for (int j = 0; j < numRows; j++) {
        BytesRefArrayWritable row = new BytesRefArrayWritable(numFields);
        byte[][] fields = null;
        if (format.equals("student")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), Double.valueOf(randomGpa()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("voter")) {
            byte[][] f = { randomName().getBytes("UTF-8"), Integer.valueOf(randomAge()).toString().getBytes("UTF-8"), randomRegistration().getBytes("UTF-8"), Double.valueOf(randomContribution()).toString().getBytes("UTF-8") };
            fields = f;
        } else if (format.equals("alltypes")) {
            byte[][] f = { Integer.valueOf(rand.nextInt(Byte.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt(Short.MAX_VALUE)).toString().getBytes("UTF-8"), Integer.valueOf(rand.nextInt()).toString().getBytes("UTF-8"), Long.valueOf(rand.nextLong()).toString().getBytes("UTF-8"), Float.valueOf(rand.nextFloat() * 1000).toString().getBytes("UTF-8"), Double.valueOf(rand.nextDouble() * 1000000).toString().getBytes("UTF-8"), randomName().getBytes("UTF-8"), randomMap(), randomArray() };
            fields = f;
        }
        for (int i = 0; i < fields.length; i++) {
            BytesRefWritable field = new BytesRefWritable(fields[i], 0, fields[i].length);
            row.set(i, field);
            pw.print(new String(fields[i]));
            if (i != fields.length - 1)
                pw.print("\t");
            else
                pw.println();
        }
        writer.append(row);
    }
    writer.close();
    pw.close();
}
Also used : RCFile(org.apache.hadoop.hive.ql.io.RCFile) Random(java.util.Random) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) FileWriter(java.io.FileWriter) DefaultCodec(org.apache.hadoop.io.compress.DefaultCodec) PrintWriter(java.io.PrintWriter) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
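
For context, the rows genData writes can be read back with RCFile.Reader, which fills one BytesRefArrayWritable per row. A minimal sketch, assuming the same fs, conf and output Path as above and that all columns are read by default (dumpData is a hypothetical helper, not part of the Hive source):

private static void dumpData(FileSystem fs, Configuration conf, Path file) throws IOException {
    // Sketch only: iterate every row and decode each column's bytes as UTF-8 text.
    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowId = new LongWritable();
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    try {
        while (reader.next(rowId)) {
            reader.getCurrentRow(cols);
            StringBuilder line = new StringBuilder();
            for (int i = 0; i < cols.size(); i++) {
                BytesRefWritable field = cols.get(i);
                line.append(new String(field.getData(), field.getStart(), field.getLength(), "UTF-8"));
                if (i != cols.size() - 1) {
                    line.append('\t');
                }
            }
            System.out.println(line);
        }
    } finally {
        reader.close();
    }
}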

Example 7 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class RCFileCat method printRecord.

/**
 * Print one record to the string builder: each column is decoded with the
 * configured CharsetDecoder, columns are separated by tabs, and the row is
 * terminated with a newline.
 * @param value the row, as a list of column byte ranges
 * @param buf the builder to append the decoded text to
 * @throws IOException if a column's bytes cannot be read or decoded
 */
private void printRecord(BytesRefArrayWritable value, StringBuilder buf) throws IOException {
    int n = value.size();
    if (n > 0) {
        BytesRefWritable v = value.unCheckedGet(0);
        ByteBuffer bb = ByteBuffer.wrap(v.getData(), v.getStart(), v.getLength());
        buf.append(decoder.decode(bb));
        for (int i = 1; i < n; i++) {
            // prepend the TAB before every column after the first, so no TAB trails the last column
            buf.append(RCFileCat.TAB);
            v = value.unCheckedGet(i);
            bb = ByteBuffer.wrap(v.getData(), v.getStart(), v.getLength());
            buf.append(decoder.decode(bb));
        }
        buf.append(RCFileCat.NEWLINE);
    }
}
Also used : ByteBuffer(java.nio.ByteBuffer) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
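
Here decoder is RCFileCat's CharsetDecoder field. The same per-column decode can be done standalone, as in the sketch below (decodeField is a hypothetical helper, and UTF-8 is assumed as the charset):

private static String decodeField(BytesRefWritable field) throws IOException {
    // Wrap the backing bytes without copying and decode them,
    // mirroring one iteration of the loop in printRecord.
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder();
    ByteBuffer bb = ByteBuffer.wrap(field.getData(), field.getStart(), field.getLength());
    return decoder.decode(bb).toString();
}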

Example 8 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project hive by apache.

the class TestRCFileMapReduceInputFormat method writeThenReadByRecordReader.

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber, long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
    Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
    Path testFile = new Path(testDir, "test_rcfile");
    fs.delete(testFile, true);
    Configuration cloneConf = new Configuration(conf);
    RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
    cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);
    RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);
    BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
    for (int i = 0; i < bytesArray.length; i++) {
        BytesRefWritable cu = null;
        cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
        bytes.set(i, cu);
    }
    for (int i = 0; i < writeCount; i++) {
        writer.append(bytes);
    }
    writer.close();
    RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat = new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
    Configuration jonconf = new Configuration(cloneConf);
    jonconf.set("mapred.input.dir", testDir.toString());
    JobContext context = new Job(jonconf);
    HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);
    List<InputSplit> splits = inputFormat.getSplits(context);
    assertEquals("splits length should be " + splitNumber, splits.size(), splitNumber);
    int readCount = 0;
    for (int i = 0; i < splits.size(); i++) {
        TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(jonconf, new TaskAttemptID());
        RecordReader<LongWritable, BytesRefArrayWritable> rr = inputFormat.createRecordReader(splits.get(i), tac);
        rr.initialize(splits.get(i), tac);
        while (rr.nextKeyValue()) {
            readCount++;
        }
    }
    assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) BytesRefArrayWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) RCFile(org.apache.hadoop.hive.ql.io.RCFile) LongWritable(org.apache.hadoop.io.LongWritable) JobContext(org.apache.hadoop.mapreduce.JobContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
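
Outside the test, RCFileMapReduceInputFormat feeds a Mapper with the same key/value types, LongWritable and BytesRefArrayWritable. A minimal illustrative sketch (ColumnCountMapper is hypothetical, not part of the Hive tests):

public static class ColumnCountMapper
        extends Mapper<LongWritable, BytesRefArrayWritable, LongWritable, IntWritable> {
    private final IntWritable width = new IntWritable();

    @Override
    protected void map(LongWritable key, BytesRefArrayWritable row, Context context)
            throws IOException, InterruptedException {
        // Each value holds one row's columns; size() is the number of columns present.
        width.set(row.size());
        context.write(key, width);
    }
}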

Example 9 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project presto by prestodb.

the class ColumnarBinaryHiveRecordCursor method parseObjectColumn.

private void parseObjectColumn(int column) {
    loaded[column] = true;
    if (hiveColumnIndexes[column] >= value.size()) {
        // this partition may contain fewer fields than what's declared in the schema
        // this happens when additional columns are added to the hive table after a partition has been created
        nulls[column] = true;
    } else {
        BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
        byte[] bytes;
        try {
            bytes = fieldData.getData();
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
        int start = fieldData.getStart();
        int length = fieldData.getLength();
        parseObjectColumn(column, bytes, start, length);
    }
}
Also used : IOException(java.io.IOException) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
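
The loaded and nulls arrays make the parse lazy: a column is only decoded when the cursor is actually asked for it. A rough sketch of the accessor side of that pattern (the method body and the objects array are illustrative, not Presto's exact code):

public Object getObject(int column) {
    // Illustrative only: decode on first access, then serve the cached result.
    if (!loaded[column]) {
        parseObjectColumn(column);
    }
    if (nulls[column]) {
        return null;
    }
    return objects[column];  // hypothetical array holding the decoded values
}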

Example 10 with BytesRefWritable

use of org.apache.hadoop.hive.serde2.columnar.BytesRefWritable in project presto by prestodb.

the class ColumnarBinaryHiveRecordCursor method parseDoubleColumn.

private void parseDoubleColumn(int column) {
    loaded[column] = true;
    if (hiveColumnIndexes[column] >= value.size()) {
        // this partition may contain fewer fields than what's declared in the schema
        // this happens when additional columns are added to the hive table after a partition has been created
        nulls[column] = true;
    } else {
        BytesRefWritable fieldData = value.unCheckedGet(hiveColumnIndexes[column]);
        byte[] bytes;
        try {
            bytes = fieldData.getData();
        } catch (IOException e) {
            throw Throwables.propagate(e);
        }
        int start = fieldData.getStart();
        int length = fieldData.getLength();
        parseDoubleColumn(column, bytes, start, length);
    }
}
Also used : IOException(java.io.IOException) BytesRefWritable(org.apache.hadoop.hive.serde2.columnar.BytesRefWritable)
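
Guava has since deprecated Throwables.propagate; newer code usually wraps the checked IOException itself, for example with java.io.UncheckedIOException as sketched below (dataOf is a hypothetical helper, not what the Presto source does):

private static byte[] dataOf(BytesRefWritable fieldData) {
    try {
        return fieldData.getData();
    } catch (IOException e) {
        // Unchecked rethrow without Guava's deprecated Throwables.propagate(e).
        throw new UncheckedIOException(e);
    }
}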

Aggregations

BytesRefWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefWritable): 28 usages
IOException (java.io.IOException): 14 usages
BytesRefArrayWritable (org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable): 14 usages
Configuration (org.apache.hadoop.conf.Configuration): 6 usages
Path (org.apache.hadoop.fs.Path): 5 usages
DefaultCodec (org.apache.hadoop.io.compress.DefaultCodec): 5 usages
LongWritable (org.apache.hadoop.io.LongWritable): 4 usages
RecordReader (org.apache.hadoop.mapred.RecordReader): 4 usages
Test (org.junit.Test): 4 usages
Random (java.util.Random): 2 usages
RCFile (org.apache.hadoop.hive.ql.io.RCFile): 2 usages
ColumnarSerDe (org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe): 2 usages
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 2 usages
Text (org.apache.hadoop.io.Text): 2 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
ColumnEntry (com.alibaba.datax.plugin.unstructuredstorage.reader.ColumnEntry): 1 usage
PrestoException (com.facebook.presto.spi.PrestoException): 1 usage
FileWriter (java.io.FileWriter): 1 usage
PrintWriter (java.io.PrintWriter): 1 usage