Example 1 with OrcStruct

Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.

From the class StreamingAssert, the method readRecords:

List<Record> readRecords() throws Exception {
    if (currentDeltas.isEmpty()) {
        throw new AssertionError("No data");
    }
    // Configure an OrcInputFormat to read the partition as an ACID (transactional) table.
    InputFormat<NullWritable, OrcStruct> inputFormat = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionLocation.toString());
    job.set("bucket_count", Integer.toString(table.getSd().getNumBuckets()));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    // A single bucket should yield exactly one split.
    InputSplit[] splits = inputFormat.getSplits(job, 1);
    assertEquals(1, splits.length);
    final AcidRecordReader<NullWritable, OrcStruct> recordReader = (AcidRecordReader<NullWritable, OrcStruct>) inputFormat.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = recordReader.createKey();
    OrcStruct value = recordReader.createValue();
    List<Record> records = new ArrayList<>();
    while (recordReader.next(key, value)) {
        RecordIdentifier recordIdentifier = recordReader.getRecordIdentifier();
        // Copy the identifier: the reader reuses its key/value objects between next() calls.
        Record record = new Record(new RecordIdentifier(recordIdentifier.getTransactionId(), recordIdentifier.getBucketId(), recordIdentifier.getRowId()), value.toString());
        System.out.println(record);
        records.add(record);
    }
    recordReader.close();
    return records;
}
Also used: ArrayList (java.util.ArrayList), AcidRecordReader (org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader), NullWritable (org.apache.hadoop.io.NullWritable), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit)
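
The Record collected above is a small value type declared inside StreamingAssert and not shown on this page. Below is a minimal sketch of what such a class could look like; the field and accessor names are assumptions for illustration, not the actual Hive source:

import org.apache.hadoop.hive.ql.io.RecordIdentifier;

// Hypothetical reconstruction of the Record value class used by readRecords().
public class Record {

    private final RecordIdentifier recordIdentifier;
    private final String row;

    public Record(RecordIdentifier recordIdentifier, String row) {
        this.recordIdentifier = recordIdentifier;
        this.row = row;
    }

    public RecordIdentifier getRecordIdentifier() {
        return recordIdentifier;
    }

    public String getRow() {
        return row;
    }

    @Override
    public String toString() {
        return recordIdentifier + " " + row;
    }
}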

Example 2 with OrcStruct

Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.

From the class TestCompactor, the method checkExpectedTxnsPresent:

private void checkExpectedTxnsPresent(Path base, Path[] deltas, String columnNamesProperty, String columnTypesProperty, int bucket, long min, long max) throws IOException {
    // A permissive ValidTxnList that treats every transaction as valid and committed.
    ValidTxnList txnList = new ValidTxnList() {

        @Override
        public boolean isTxnValid(long txnid) {
            return true;
        }

        @Override
        public RangeResponse isTxnRangeValid(long minTxnId, long maxTxnId) {
            return RangeResponse.ALL;
        }

        @Override
        public String writeToString() {
            return "";
        }

        @Override
        public void readFromString(String src) {
        }

        @Override
        public long getHighWatermark() {
            return Long.MAX_VALUE;
        }

        @Override
        public long[] getInvalidTransactions() {
            return new long[0];
        }

        @Override
        public boolean isValidBase(long txnid) {
            return true;
        }
    };
    OrcInputFormat aif = new OrcInputFormat();
    Configuration conf = new Configuration();
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNamesProperty);
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypesProperty);
    HiveConf.setBoolVar(conf, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    // Read the merged ACID events for this bucket from the base and delta directories.
    AcidInputFormat.RawReader<OrcStruct> reader = aif.getRawReader(conf, false, bucket, txnList, base, deltas);
    RecordIdentifier identifier = reader.createKey();
    OrcStruct value = reader.createValue();
    long currentTxn = min;
    boolean seenCurrentTxn = false;
    // Transaction ids must start at min and advance contiguously up to max.
    while (reader.next(identifier, value)) {
        if (!seenCurrentTxn) {
            Assert.assertEquals(currentTxn, identifier.getTransactionId());
            seenCurrentTxn = true;
        }
        if (currentTxn != identifier.getTransactionId()) {
            Assert.assertEquals(currentTxn + 1, identifier.getTransactionId());
            currentTxn++;
        }
    }
    Assert.assertEquals(max, currentTxn);
}
Also used: AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), Configuration (org.apache.hadoop.conf.Configuration), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList)
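
The anonymous ValidTxnList above exists only to declare every transaction valid. Depending on the Hive branch in use, the same effect can likely be achieved with ValidReadTxnList, whose no-argument constructor is assumed here to accept all transactions (no exceptions, high watermark of Long.MAX_VALUE); verify this against your Hive version before relying on it:

import org.apache.hadoop.hive.common.ValidReadTxnList;
import org.apache.hadoop.hive.common.ValidTxnList;

// Assumed shorthand for the anonymous class above: an empty exception list
// with a Long.MAX_VALUE high watermark treats every transaction as valid.
ValidTxnList txnList = new ValidReadTxnList();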

Example 3 with OrcStruct

Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project hive by apache.

From the class TestStreaming, the method checkDataWritten:

private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, String... records) throws Exception {
    ValidTxnList txns = msClient.getValidTxns();
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, txns);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    System.out.println("Files found: ");
    for (AcidUtils.ParsedDelta pd : current) System.out.println(pd.getPath().toString());
    Assert.assertEquals(numExpectedFiles, current.size());
    // find the minimum and maximum transaction ids across the current deltas
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta pd : current) {
        if (pd.getMaxTransaction() > max)
            max = pd.getMaxTransaction();
        if (pd.getMinTransaction() < min)
            min = pd.getMinTransaction();
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);
    // Read the rows back through an OrcInputFormat configured for an ACID table scan.
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionPath.toString());
    job.set("bucket_count", Integer.toString(buckets));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    job.set(ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, "true");
    job.set(ValidTxnList.VALID_TXNS_KEY, txns.toString());
    InputSplit[] splits = inf.getSplits(job, buckets);
    Assert.assertEquals(buckets, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    // Every expected record should come back, in order, from the first split.
    for (int i = 0; i < records.length; i++) {
        Assert.assertTrue(rr.next(key, value));
        Assert.assertEquals(records[i], value.toString());
    }
    Assert.assertFalse(rr.next(key, value));
}
Also used: NullWritable (org.apache.hadoop.io.NullWritable), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), InputFormat (org.apache.hadoop.mapred.InputFormat), ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils), AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)
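
Note that the record assertions above read only splits[0]; with several buckets, the remaining splits go unchecked. Here is a hedged extension that drains every split and counts the rows seen; the loop is illustrative and not part of the original test:

// Illustrative only: read every bucket's split, not just the first one.
int rowsSeen = 0;
for (InputSplit split : splits) {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> reader = inf.getRecordReader(split, job, Reporter.NULL);
    NullWritable k = reader.createKey();
    OrcStruct v = reader.createValue();
    while (reader.next(k, v)) {
        rowsSeen++;
    }
    reader.close();
}
System.out.println("Total rows across " + splits.length + " splits: " + rowsSeen);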

Example 4 with OrcStruct

Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project druid by druid-io.

From the class DruidOrcInputFormatTest, the method testRead:

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();
    reader.initialize(split, context);
    // Read the first record and parse it into a Druid input row.
    reader.nextKeyValue();
    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);
    Assert.assertEquals(4, row.getEvent().keySet().size());
    Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));
    reader.close();
}
Also used: OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcNewInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat), InputFormat (org.apache.hadoop.mapreduce.InputFormat), TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID), TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl), RecordReader (org.apache.hadoop.mapreduce.RecordReader), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), MapBasedInputRow (io.druid.data.input.MapBasedInputRow), DateTime (org.joda.time.DateTime), Test (org.junit.Test)
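
testRead consumes only the first record of the split. A hedged sketch of draining the rest through the same parser follows; the names mirror the test above, but the loop itself is an assumption added for illustration:

// Illustrative continuation: parse any remaining records in the split.
while (reader.nextKeyValue()) {
    OrcStruct next = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow parsed = (MapBasedInputRow) parser.parse(next);
    // Each row should expose the same dimensions as the first one.
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), parsed.getDimensions());
}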

Example 5 with OrcStruct

Use of org.apache.hadoop.hive.ql.io.orc.OrcStruct in project presto by prestodb.

From the class OrcFileRewriter, the method uncompressedSize:

// Recursively estimates the uncompressed size of a value read from an ORC file.
private static int uncompressedSize(Object object) throws IOException {
    if (object instanceof OrcStruct) {
        OrcStruct struct = (OrcStruct) object;
        int size = 0;
        for (int i = 0; i < struct.getNumFields(); i++) {
            size += uncompressedSize(getFieldValue(struct, i));
        }
        return size;
    }
    // Nulls and booleans each count as a single byte.
    if ((object == null) || (object instanceof BooleanWritable)) {
        return SIZE_OF_BYTE;
    }
    if (object instanceof LongWritable) {
        return SIZE_OF_LONG;
    }
    if (object instanceof DoubleWritable) {
        return SIZE_OF_DOUBLE;
    }
    if (object instanceof Text) {
        return ((Text) object).getLength();
    }
    if (object instanceof BytesWritable) {
        return ((BytesWritable) object).getLength();
    }
    if (object instanceof List<?>) {
        int size = 0;
        for (Object element : (Iterable<?>) object) {
            size += uncompressedSize(element);
        }
        return size;
    }
    if (object instanceof Map<?, ?>) {
        int size = 0;
        for (Map.Entry<?, ?> entry : ((Map<?, ?>) object).entrySet()) {
            size += uncompressedSize(entry.getKey());
            size += uncompressedSize(entry.getValue());
        }
        return size;
    }
    throw new IOException("Unhandled ORC object: " + object.getClass().getName());
}
Also used: DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable), Text (org.apache.hadoop.io.Text), BytesWritable (org.apache.hadoop.io.BytesWritable), InterruptedIOException (java.io.InterruptedIOException), IOException (java.io.IOException), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), BooleanWritable (org.apache.hadoop.io.BooleanWritable), List (java.util.List), LongWritable (org.apache.hadoop.io.LongWritable), Map (java.util.Map)
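
The getFieldValue(struct, i) helper called above is not shown on this page. OrcStruct's per-field accessor in this Hive package is not public, so code outside the package commonly reaches it via reflection. Below is a minimal sketch of such a helper, offered as an assumption rather than Presto's actual implementation:

import java.lang.reflect.Method;

import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

final class OrcStructAccess {

    private static final Method GET_FIELD_VALUE;

    static {
        try {
            // Assumes OrcStruct declares a package-private getFieldValue(int).
            GET_FIELD_VALUE = OrcStruct.class.getDeclaredMethod("getFieldValue", int.class);
            GET_FIELD_VALUE.setAccessible(true);
        } catch (NoSuchMethodException e) {
            throw new ExceptionInInitializerError(e);
        }
    }

    private OrcStructAccess() {
    }

    static Object getFieldValue(OrcStruct struct, int index) {
        try {
            return GET_FIELD_VALUE.invoke(struct, index);
        } catch (ReflectiveOperationException e) {
            throw new IllegalStateException("Failed to read ORC field " + index, e);
        }
    }
}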

Aggregations

OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 5 usages
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 3 usages
ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList): 2 usages
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 2 usages
NullWritable (org.apache.hadoop.io.NullWritable): 2 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
MapBasedInputRow (io.druid.data.input.MapBasedInputRow): 1 usage
IOException (java.io.IOException): 1 usage
InterruptedIOException (java.io.InterruptedIOException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
List (java.util.List): 1 usage
Map (java.util.Map): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
AcidInputFormat (org.apache.hadoop.hive.ql.io.AcidInputFormat): 1 usage
AcidRecordReader (org.apache.hadoop.hive.ql.io.AcidInputFormat.AcidRecordReader): 1 usage
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils): 1 usage
OrcNewInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcNewInputFormat): 1 usage
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 1 usage
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 1 usage