Example 6 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

The class AbstractRecordWriter, method closeBatch.

@Override
public void closeBatch() throws StreamingIOFailure {
    boolean haveError = false;
    for (RecordUpdater updater : updaters) {
        if (updater != null) {
            try {
                // try not to leave any files open
                updater.close(false);
            } catch (Exception ex) {
                haveError = true;
                LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex);
            }
        }
    }
    updaters.clear();
    if (haveError) {
        throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark());
    }
}
Also used : RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) TException(org.apache.thrift.TException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException)
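For context, here is a minimal sketch of how such an updaters list is typically populated before closeBatch() runs: one RecordUpdater per bucket, obtained from an AcidOutputFormat. This is not the actual AbstractRecordWriter code; the helper name and the partitionPath, totalBuckets, and writeId parameters are illustrative placeholders, and java.util.List/ArrayList imports are assumed.

// Hypothetical helper, not taken from AbstractRecordWriter: builds one
// RecordUpdater per bucket using the same AcidOutputFormat.Options calls
// that appear in the examples below.
private List<RecordUpdater> createUpdaters(AcidOutputFormat<?, ?> outputFormat,
        Path partitionPath, ObjectInspector inspector, Configuration conf,
        int totalBuckets, long writeId) throws IOException {
    List<RecordUpdater> updaters = new ArrayList<>(totalBuckets);
    for (int bucket = 0; bucket < totalBuckets; bucket++) {
        AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
                .inspector(inspector)
                .bucket(bucket)
                .writingBase(false)
                .minimumWriteId(writeId)
                .maximumWriteId(writeId)
                .finalDestination(partitionPath);
        // same entry point the tests below use to create writers
        updaters.add(outputFormat.getRecordUpdater(partitionPath, options));
    }
    return updaters;
}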

Example 7 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

The class TestOrcRawRecordMerger, method testNewBaseAndDelta.

private void testNewBaseAndDelta(boolean use130Format) throws Exception {
    final int BUCKET = 10;
    String[] values = new String[] { "first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth" };
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testNewBaseAndDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write the base
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).inspector(inspector).bucket(BUCKET).finalDestination(root);
    final int BUCKET_PROPERTY = BucketCodec.V1.encode(options);
    if (!use130Format) {
        options.statementId(-1);
    }
    RecordUpdater ru = of.getRecordUpdater(root, options.writingBase(true).maximumWriteId(100));
    for (String v : values) {
        ru.insert(0, new MyRow(v));
    }
    ru.close(false);
    // write a delta
    ru = of.getRecordUpdater(root, options.writingBase(false).minimumWriteId(200).maximumWriteId(200).recordIdColumn(1));
    ru.update(200, new MyRow("update 1", 0, 0, BUCKET_PROPERTY));
    ru.update(200, new MyRow("update 2", 2, 0, BUCKET_PROPERTY));
    ru.update(200, new MyRow("update 3", 3, 0, BUCKET_PROPERTY));
    ru.delete(200, new MyRow("", 7, 0, BUCKET_PROPERTY));
    ru.delete(200, new MyRow("", 8, 0, BUCKET_PROPERTY));
    ru.close(false);
    ValidWriteIdList writeIdList = new ValidReaderWriteIdList("testNewBaseAndDelta:200:" + Long.MAX_VALUE);
    AcidUtils.Directory directory = AcidUtils.getAcidState(root, conf, writeIdList);
    assertEquals(new Path(root, "base_0000100"), directory.getBaseDirectory());
    assertEquals(new Path(root, use130Format ? AcidUtils.deleteDeltaSubdir(200, 200, 0) : AcidUtils.deleteDeltaSubdir(200, 200)), directory.getCurrentDirectories().get(0).getPath());
    assertEquals(new Path(root, use130Format ? AcidUtils.deltaSubdir(200, 200, 0) : AcidUtils.deltaSubdir(200, 200)), directory.getCurrentDirectories().get(1).getPath());
    Path basePath = AcidUtils.createBucketFile(directory.getBaseDirectory(), BUCKET);
    Path deltaPath = AcidUtils.createBucketFile(directory.getCurrentDirectories().get(1).getPath(), BUCKET);
    Path deleteDeltaDir = directory.getCurrentDirectories().get(0).getPath();
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    AcidUtils.setAcidOperationalProperties(conf, true, null);
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    // the first "split" is for base/
    Reader baseReader = OrcFile.createReader(basePath, OrcFile.readerOptions(conf));
    OrcRawRecordMerger merger = new OrcRawRecordMerger(conf, true, baseReader, false, BUCKET, createMaximalTxnList(), new Reader.Options(), new Path[] { deleteDeltaDir }, new OrcRawRecordMerger.Options().isCompacting(false));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    RecordIdentifier id = merger.createKey();
    OrcStruct event = merger.createValue();
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 0, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 1, 0), id);
    assertEquals("second", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 2, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 3, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 4, 0), id);
    assertEquals("fifth", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 5, 0), id);
    assertEquals("sixth", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 6, 0), id);
    assertEquals("seventh", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 7, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 8, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 9, 0), id);
    assertEquals("tenth", getValue(event));
    assertEquals(false, merger.next(id, event));
    merger.close();
    // second "split" is delta_200_200
    baseReader = OrcFile.createReader(deltaPath, OrcFile.readerOptions(conf));
    merger = new OrcRawRecordMerger(conf, true, baseReader, false, BUCKET, createMaximalTxnList(), new Reader.Options(), new Path[] { deleteDeltaDir }, new OrcRawRecordMerger.Options().isCompacting(false));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 0, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 2, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 3, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 7, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 8, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 0, 200), id);
    assertEquals("update 1", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 1, 200), id);
    assertEquals("update 2", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 2, 200), id);
    assertEquals("update 3", getValue(event));
    assertEquals(false, merger.next(id, event));
    merger.close();
    // now run as if it's a minor Compaction so we don't collapse events
    // here there is only 1 "split" since we only have data for 1 bucket
    merger = new OrcRawRecordMerger(conf, false, null, false, BUCKET, createMaximalTxnList(), new Reader.Options(), AcidUtils.getPaths(directory.getCurrentDirectories()), new OrcRawRecordMerger.Options().isCompacting(true));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    assertEquals(true, merger.next(id, event));
    // minor compaction, so 'base_0000100' files are ignored; all deletes end up first
    // since they all modify primordial rows
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 0, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 2, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 3, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 7, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 8, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    // data from delta_200_200
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 0, 200), id);
    assertEquals("update 1", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 1, 200), id);
    assertEquals("update 2", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 2, 200), id);
    assertEquals("update 3", getValue(event));
    assertEquals(false, merger.next(id, event));
    merger.close();
    // now run as if it's a major Compaction so we collapse events
    // here there is only 1 "split" since we only have data for 1 bucket
    baseReader = OrcFile.createReader(basePath, OrcFile.readerOptions(conf));
    merger = new OrcRawRecordMerger(conf, true, null, false, BUCKET, createMaximalTxnList(), new Reader.Options(), AcidUtils.getPaths(directory.getCurrentDirectories()), new OrcRawRecordMerger.Options().isCompacting(true).isMajorCompaction(true).baseDir(new Path(root, "base_0000100")));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 0, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 1, 0), id);
    assertEquals("second", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 2, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 3, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 4, 0), id);
    assertEquals("fifth", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 5, 0), id);
    assertEquals("sixth", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 6, 0), id);
    assertEquals("seventh", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 7, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 8, 200), id);
    assertNull(OrcRecordUpdater.getRow(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(0, BUCKET_PROPERTY, 9, 0), id);
    assertEquals("tenth", getValue(event));
    // data from delta_200_200
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 0, 200), id);
    assertEquals("update 1", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 1, 200), id);
    assertEquals("update 2", getValue(event));
    assertEquals(true, merger.next(id, event));
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
    assertEquals(new ReaderKey(200, BUCKET_PROPERTY, 2, 200), id);
    assertEquals("update 3", getValue(event));
    assertEquals(false, merger.next(id, event));
    merger.close();
    // try ignoring write id 200 and make sure it still works
    ValidWriteIdList writeIds = new ValidReaderWriteIdList("testNewBaseAndDelta:2000:200:200");
    // again 1st split is for base/
    baseReader = OrcFile.createReader(basePath, OrcFile.readerOptions(conf));
    merger = new OrcRawRecordMerger(conf, false, baseReader, false, BUCKET, writeIds, new Reader.Options(), new Path[] { deleteDeltaDir }, new OrcRawRecordMerger.Options().isCompacting(false));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    for (int i = 0; i < values.length; ++i) {
        assertEquals(true, merger.next(id, event));
        LOG.info("id = " + id + " event = " + event);
        assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(event));
        assertEquals(new ReaderKey(0, BUCKET_PROPERTY, i, 0), id);
        assertEquals(values[i], getValue(event));
    }
    assertEquals(false, merger.next(id, event));
    merger.close();
    // 2nd split is for delta_200_200, which is filtered out entirely by 'writeIds'
    baseReader = OrcFile.createReader(deltaPath, OrcFile.readerOptions(conf));
    merger = new OrcRawRecordMerger(conf, false, baseReader, false, BUCKET, writeIds, new Reader.Options(), new Path[] { deleteDeltaDir }, new OrcRawRecordMerger.Options().isCompacting(false));
    assertEquals(null, merger.getMinKey());
    assertEquals(null, merger.getMaxKey());
    assertEquals(false, merger.next(id, event));
    merger.close();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) ReaderKey(org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList) OrcAcidUtils(org.apache.orc.impl.OrcAcidUtils) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)
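Stripped of its assertions, the write path of testNewBaseAndDelta reduces to the following RecordUpdater lifecycle. This is a condensed sketch, assuming the same conf, root, inspector, MyRow, and bucket setup as the test above.

OrcOutputFormat of = new OrcOutputFormat();
AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
        .inspector(inspector).bucket(10).finalDestination(root);
int bucketProperty = BucketCodec.V1.encode(options);

// base_0000100: plain inserts under write id 0
RecordUpdater ru = of.getRecordUpdater(root, options.writingBase(true).maximumWriteId(100));
ru.insert(0, new MyRow("first"));
ru.close(false);

// delta_0000200_0000200 (plus a delete_delta): updates and deletes need recordIdColumn()
ru = of.getRecordUpdater(root,
        options.writingBase(false).minimumWriteId(200).maximumWriteId(200).recordIdColumn(1));
ru.update(200, new MyRow("update 1", 0, 0, bucketProperty)); // rewrite base row 0
ru.delete(200, new MyRow("", 7, 0, bucketProperty));         // delete base row 7
ru.close(false);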

Example 8 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

The class TestOrcRawRecordMerger, method testRecordReaderDelta.

/**
 * Test the RecordReader when there are two deltas and no base.
 * @throws Exception
 */
@Test
public void testRecordReaderDelta() throws Exception {
    final int BUCKET = 0;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testRecordReaderDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write a delta
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(BUCKET).inspector(inspector).filesystem(fs).writingBase(false).minimumWriteId(1).maximumWriteId(1).finalDestination(root);
    RecordUpdater ru = of.getRecordUpdater(root, options);
    String[][] values = { new String[] { "a", "b", "c", "d", "e" }, new String[] { "f", "g", "h", "i", "j" } };
    for (int i = 0; i < values[0].length; ++i) {
        ru.insert(1, new MyRow(values[0][i]));
    }
    ru.close(false);
    // write a delta
    options.minimumWriteId(2).maximumWriteId(2);
    ru = of.getRecordUpdater(root, options);
    for (int i = 0; i < values[1].length; ++i) {
        ru.insert(2, new MyRow(values[1][i]));
    }
    ru.close(false);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "1");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(2, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    for (int j = 0; j < splits.length; j++) {
        InputSplit split = splits[j];
        rr = inf.getRecordReader(split, job, Reporter.NULL);
        OrcStruct row = rr.createValue();
        for (int i = 0; i < values[j].length; ++i) {
            System.out.println("Checking " + i);
            String msg = "split[" + j + "] at i=" + i;
            assertEquals(msg, true, rr.next(NullWritable.get(), row));
            assertEquals(msg, values[j][i], row.getFieldValue(0).toString());
        }
        assertEquals(false, rr.next(NullWritable.get(), row));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) NullWritable(org.apache.hadoop.io.NullWritable) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
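The read side of this test boils down to the JobConf settings below. This is a condensed sketch, reusing root and the MyRow schema helpers from the test and assuming the same transactional table properties; it would live in a method declared to throw Exception.

JobConf job = new JobConf();
job.set("mapred.input.dir", root.toString());
job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
AcidUtils.setAcidOperationalProperties(job, true, null);
job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);

InputFormat inf = new OrcInputFormat();
for (InputSplit split : inf.getSplits(job, 1)) {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
            inf.getRecordReader(split, job, Reporter.NULL);
    OrcStruct row = rr.createValue();
    while (rr.next(NullWritable.get(), row)) {
        System.out.println(row.getFieldValue(0)); // the single text column of MyRow
    }
    rr.close();
}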

Example 9 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

The class TestOrcRawRecordMerger, method testRecordReaderOldBaseAndDelta.

/**
 * Test the OrcRecordUpdater with the OrcRawRecordMerger when there is
 * an old-style (pre-ACID) base file and deltas.
 * @throws Exception
 * @see #testRecordReaderNewBaseAndDelta()
 */
@Test
public void testRecordReaderOldBaseAndDelta() throws Exception {
    final int BUCKET = 10;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testOldBaseAndDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write the base
    MemoryManager mgr = new MemoryManagerImpl(conf) {

        int rowsAddedSinceCheck = 0;

        @Override
        public synchronized void addedRow(int rows) throws IOException {
            rowsAddedSinceCheck += rows;
            if (rowsAddedSinceCheck >= 2) {
                notifyWriters();
                rowsAddedSinceCheck = 0;
            }
        }
    };
    // make 5 stripes with 2 rows each
    Writer writer = OrcFile.createWriter(new Path(root, "0000010_0"), OrcFile.writerOptions(conf).inspector(inspector).fileSystem(fs).blockPadding(false).bufferSize(10000).compress(CompressionKind.NONE).stripeSize(1).memory(mgr).batchSize(2).version(OrcFile.Version.V_0_11));
    String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
    for (int i = 0; i < values.length; ++i) {
        writer.addRow(new BigRow(i, i, values[i], i, i));
    }
    writer.close();
    // write a delta
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).writingBase(false).minimumWriteId(1).maximumWriteId(1).bucket(BUCKET).inspector(inspector).filesystem(fs).recordIdColumn(5).finalDestination(root);
    final int BUCKET_PROPERTY = BucketCodec.V1.encode(options);
    RecordUpdater ru = of.getRecordUpdater(root, options);
    values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
    for (int i = 0; i < values.length; ++i) {
        if (values[i] != null) {
            ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET_PROPERTY));
        }
    }
    ru.delete(1, new BigRow(9, 0, BUCKET_PROPERTY));
    // this doesn't create a key index presumably because writerOptions are not set on 'options'
    ru.close(false);
    // write a delta
    options = options.minimumWriteId(100).maximumWriteId(100);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
    for (int i = 0; i < values.length - 1; ++i) {
        if (values[i] != null) {
            ru.update(100, new BigRow(i, i, values[i], i, i, i, 0, BUCKET_PROPERTY));
        }
    }
    // do this before next update so that delete_delta is properly sorted
    ru.delete(100, new BigRow(8, 0, BUCKET_PROPERTY));
    // because row 8 was updated and thus has a different RecordIdentifier now
    ru.update(100, new BigRow(7, 7, values[values.length - 1], 7, 7, 2, 1, BUCKET_PROPERTY));
    ru.close(false);
    MyResult[] expected = new MyResult[10];
    int k = 0;
    expected[k++] = new MyResult(0, "0.0");
    expected[k++] = new MyResult(1, "0.1");
    expected[k++] = new MyResult(2, "1.0");
    expected[k++] = new MyResult(3, "1.1");
    expected[k++] = new MyResult(4, "2.0");
    expected[k++] = new MyResult(5, "2.1");
    expected[k++] = new MyResult(6, "3.0");
    expected[k] = new MyResult(7, "3.1");
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(7, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    for (InputSplit split : splits) {
        rr = inf.getRecordReader(split, job, Reporter.NULL);
        NullWritable key = rr.createKey();
        OrcStruct value = rr.createValue();
        while (rr.next(key, value)) {
            MyResult mr = new MyResult(Integer.parseInt(value.getFieldValue(0).toString()), value.getFieldValue(2).toString());
            int i = 0;
            for (; i < expected.length; i++) {
                if (mr.equals(expected[i])) {
                    expected[i] = null;
                    break;
                }
            }
            if (i >= expected.length) {
                // not found
                assertTrue("Found unexpected row: " + mr, false);
            }
        }
    }
    for (MyResult mr : expected) {
        assertTrue("Expected " + mr + " not found in any InputSplit", mr == null);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) MemoryManagerImpl(org.apache.orc.impl.MemoryManagerImpl) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) MemoryManager(org.apache.orc.MemoryManager) NullWritable(org.apache.hadoop.io.NullWritable) InputFormat(org.apache.hadoop.mapred.InputFormat) Test(org.junit.Test)
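The only non-obvious piece above is the anonymous MemoryManager: it forces a flush check every couple of rows so that stripeSize(1) really produces many small stripes and therefore several input splits. Isolated, and assuming the same conf, fs, inspector, and target path as in the test, the pattern looks like this:

MemoryManager tinyStripes = new MemoryManagerImpl(conf) {
    int rowsAddedSinceCheck = 0;

    @Override
    public synchronized void addedRow(int rows) throws IOException {
        rowsAddedSinceCheck += rows;
        if (rowsAddedSinceCheck >= 2) {
            // prompt registered writers to check memory and flush a stripe if needed
            notifyWriters();
            rowsAddedSinceCheck = 0;
        }
    }
};
Writer writer = OrcFile.createWriter(new Path(root, "0000010_0"),
        OrcFile.writerOptions(conf).inspector(inspector).fileSystem(fs)
                .stripeSize(1).batchSize(2).memory(tinyStripes));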

Example 10 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

The class TestOrcRecordUpdater, method testUpdates.

@Test
public void testUpdates() throws Exception {
    Path root = new Path(workDir, "testUpdates");
    Configuration conf = new Configuration();
    FileSystem fs = root.getFileSystem(conf);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    int bucket = 20;
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(false).minimumWriteId(100).maximumWriteId(100).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.update(100, new MyRow("update", 30, 10, bucket));
    updater.delete(100, new MyRow("", 60, 40, bucket));
    assertEquals(-1L, updater.getStats().getRowCount());
    updater.close(false);
    Path bucketPath = AcidUtils.createFilename(root, options);
    Reader reader = OrcFile.createReader(bucketPath, new OrcFile.ReaderOptions(conf).filesystem(fs));
    assertEquals(1, reader.getNumberOfRows());
    RecordReader rows = reader.rows();
    // check the contents of the file
    assertEquals(true, rows.hasNext());
    OrcStruct row = (OrcStruct) rows.next(null);
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(row));
    assertEquals(100, OrcRecordUpdater.getCurrentTransaction(row));
    assertEquals(100, OrcRecordUpdater.getOriginalTransaction(row));
    int bucketProperty = OrcRecordUpdater.getBucket(row);
    assertEquals(bucket, BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty));
    assertEquals(0, OrcRecordUpdater.getRowId(row));
    assertEquals("update", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    rows.close();
    options.writingDeleteDelta(true);
    bucketPath = AcidUtils.createFilename(root, options);
    reader = OrcFile.createReader(bucketPath, new OrcFile.ReaderOptions(conf).filesystem(fs));
    assertEquals(2, reader.getNumberOfRows());
    rows = reader.rows();
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(row));
    assertEquals(100, OrcRecordUpdater.getCurrentTransaction(row));
    assertEquals(10, OrcRecordUpdater.getOriginalTransaction(row));
    bucketProperty = OrcRecordUpdater.getBucket(row);
    assertEquals(bucket, BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty));
    assertEquals(30, OrcRecordUpdater.getRowId(row));
    assertNull(OrcRecordUpdater.getRow(row));
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(OrcRecordUpdater.DELETE_OPERATION, OrcRecordUpdater.getOperation(row));
    assertEquals(100, OrcRecordUpdater.getCurrentTransaction(row));
    assertEquals(40, OrcRecordUpdater.getOriginalTransaction(row));
    bucketProperty = OrcRecordUpdater.getBucket(row);
    assertEquals(bucket, BucketCodec.determineVersion(bucketProperty).decodeWriterId(bucketProperty));
    assertEquals(60, OrcRecordUpdater.getRowId(row));
    assertNull(OrcRecordUpdater.getRow(row));
    assertEquals(false, rows.hasNext());
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) Test(org.junit.Test)
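The assertions above all go through OrcRecordUpdater's static accessors for the ACID metadata columns. Below is a hypothetical helper (not part of the test) that dumps those columns for every event in a bucket or delete_delta file, using only calls that appear in the examples.

// Hypothetical utility: print the ACID event metadata of every row in an ORC bucket file.
static void dumpAcidEvents(Path bucketFile, Configuration conf) throws IOException {
    Reader reader = OrcFile.createReader(bucketFile, OrcFile.readerOptions(conf));
    RecordReader rows = reader.rows();
    OrcStruct row = null;
    while (rows.hasNext()) {
        row = (OrcStruct) rows.next(row);
        System.out.println("op=" + OrcRecordUpdater.getOperation(row)
                + " current=" + OrcRecordUpdater.getCurrentTransaction(row)
                + " original=" + OrcRecordUpdater.getOriginalTransaction(row)
                + " bucket=" + OrcRecordUpdater.getBucket(row)
                + " rowId=" + OrcRecordUpdater.getRowId(row)
                + " row=" + OrcRecordUpdater.getRow(row)); // null for delete events
    }
    rows.close();
}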

Aggregations

RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 13
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 11
Path (org.apache.hadoop.fs.Path): 10
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 10
Configuration (org.apache.hadoop.conf.Configuration): 9
FileSystem (org.apache.hadoop.fs.FileSystem): 9
Test (org.junit.Test): 9
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8
InputSplit (org.apache.hadoop.mapred.InputSplit): 6
JobConf (org.apache.hadoop.mapred.JobConf): 5
NullWritable (org.apache.hadoop.io.NullWritable): 4
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 2
MemoryManager (org.apache.orc.MemoryManager): 2
MemoryManagerImpl (org.apache.orc.impl.MemoryManagerImpl): 2
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
DataInputStream (java.io.DataInputStream): 1
IOException (java.io.IOException): 1
PrintStream (java.io.PrintStream): 1
Properties (java.util.Properties): 1