Example 1 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

From the class TestOrcRawRecordMerger, method testRecordReaderIncompleteDelta:

/**
   * 
   * @param use130Format true means use delta_0001_0001_0000 format, else delta_0001_00001
   */
private void testRecordReaderIncompleteDelta(boolean use130Format) throws Exception {
    final int BUCKET = 1;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    Path root = new Path(tmpDir, "testRecordReaderIncompleteDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write a base
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
        .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
        .bucket(BUCKET).inspector(inspector).filesystem(fs).finalDestination(root);
    if (!use130Format) {
        options.statementId(-1);
    }
    RecordUpdater ru = of.getRecordUpdater(root, options);
    String[] values = new String[] { "1", "2", "3", "4", "5" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(0, new MyRow(values[i]));
    }
    ru.close(false);
    // write a delta, but leave the updater open so the delta stays incomplete
    options.writingBase(false).minimumTransactionId(10).maximumTransactionId(19);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { "6", "7", "8" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(1, new MyRow(values[i]));
    }
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "2");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    // read the keys before the delta is flushed
    InputSplit[] splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    System.out.println("Looking at split " + splits[0]);
    for (int i = 1; i < 6; ++i) {
        System.out.println("Checking row " + i);
        assertEquals(true, rr.next(key, value));
        assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));
    ru.flush();
    ru.flush();
    values = new String[] { "9", "10" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(3, new MyRow(values[i]));
    }
    ru.flush();
    splits = inf.getSplits(job, 1);
    assertEquals(2, splits.length);
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    Path sideFile = new Path(root + "/" + (use130Format ? AcidUtils.deltaSubdir(10, 19, 0) : AcidUtils.deltaSubdir(10, 19)) + "/bucket_00001_flush_length");
    assertEquals(true, fs.exists(sideFile));
    assertEquals(24, fs.getFileStatus(sideFile).getLen());
    for (int i = 1; i < 11; ++i) {
        assertEquals(true, rr.next(key, value));
        assertEquals(Integer.toString(i), value.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(key, value));
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) NullWritable(org.apache.hadoop.io.NullWritable) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit)
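
A note on the side-file assertion above: the assertEquals(24, ...) is consistent with each flush() appending one 8-byte length to the bucket_00001_flush_length side file (three flushes, 24 bytes). The snippet below is an illustrative sketch only, not part of the original test; it reuses fs, root and use130Format from the method above and additionally needs org.apache.hadoop.fs.FSDataInputStream.

    // Sketch: read the "_flush_length" side file and keep the newest recorded
    // length. One 8-byte long per flush is an assumption consistent with the
    // 24-byte assertion after three flushes.
    Path deltaDir = new Path(root, use130Format
        ? AcidUtils.deltaSubdir(10, 19, 0) : AcidUtils.deltaSubdir(10, 19));
    Path sideFile = new Path(deltaDir, "bucket_00001_flush_length");
    long lastFlushedLength = -1;
    try (FSDataInputStream in = fs.open(sideFile)) {
        long entries = fs.getFileStatus(sideFile).getLen() / 8;
        for (long i = 0; i < entries; ++i) {
            lastFlushedLength = in.readLong();  // the last entry is the newest readable length
        }
    }
    System.out.println("readable bytes in the open delta: " + lastFlushedLength);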

Example 2 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

From the class TestOrcRawRecordMerger, method testRecordReaderNewBaseAndDelta:

/**
   * Test the RecordReader when there is a new base and a delta.
   * @throws Exception
   */
@Test
public void testRecordReaderNewBaseAndDelta() throws Exception {
    final int BUCKET = 11;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testRecordReaderNewBaseAndDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write the base
    MemoryManager mgr = new MemoryManager(conf) {

        int rowsAddedSinceCheck = 0;

        @Override
        public synchronized void addedRow(int rows) throws IOException {
            rowsAddedSinceCheck += rows;
            if (rowsAddedSinceCheck >= 2) {
                notifyWriters();
                rowsAddedSinceCheck = 0;
            }
        }
    };
    // make 5 stripes with 2 rows each
    OrcRecordUpdater.OrcOptions options = (OrcRecordUpdater.OrcOptions) new OrcRecordUpdater.OrcOptions(conf)
        .writingBase(true).minimumTransactionId(0).maximumTransactionId(0)
        .bucket(BUCKET).inspector(inspector).filesystem(fs);
    options.orcOptions(OrcFile.writerOptions(conf)
        .stripeSize(1).blockPadding(false).compress(CompressionKind.NONE)
        .memory(mgr).batchSize(2));
    options.finalDestination(root);
    RecordUpdater ru = of.getRecordUpdater(root, options);
    String[] values = new String[] { "ignore.1", "0.1", "ignore.2", "ignore.3", "2.0", "2.1", "3.0", "ignore.4", "ignore.5", "ignore.6" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(0, new BigRow(i, i, values[i], i, i));
    }
    ru.close(false);
    // write a delta
    options.writingBase(false).minimumTransactionId(1).maximumTransactionId(1).recordIdColumn(5);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { "0.0", null, null, "1.1", null, null, null, "ignore.7" };
    for (int i = 0; i < values.length; ++i) {
        if (values[i] != null) {
            ru.update(1, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
        }
    }
    ru.delete(100, new BigRow(9, 0, BUCKET));
    ru.close(false);
    // write a second delta
    options.minimumTransactionId(2).maximumTransactionId(2);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { null, null, "1.0", null, null, null, null, "3.1" };
    for (int i = 0; i < values.length; ++i) {
        if (values[i] != null) {
            ru.update(2, new BigRow(i, i, values[i], i, i, i, 0, BUCKET));
        }
    }
    ru.delete(100, new BigRow(8, 0, BUCKET));
    ru.close(false);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, BigRow.getColumnTypesProperty());
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(5, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    // loop through the 5 splits and read each
    for (int i = 0; i < 4; ++i) {
        System.out.println("starting split " + i + " = " + splits[i]);
        rr = inf.getRecordReader(splits[i], job, Reporter.NULL);
        NullWritable key = rr.createKey();
        OrcStruct value = rr.createValue();
        // there should be exactly two rows per split
        for (int j = 0; j < 2; ++j) {
            System.out.println("i = " + i + ", j = " + j);
            assertEquals(true, rr.next(key, value));
            System.out.println("record = " + value);
            assertEquals(i + "." + j, value.getFieldValue(2).toString());
        }
        assertEquals(false, rr.next(key, value));
    }
    rr = inf.getRecordReader(splits[4], job, Reporter.NULL);
    assertEquals(false, rr.next(rr.createKey(), rr.createValue()));
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) MemoryManager(org.apache.orc.impl.MemoryManager) NullWritable(org.apache.hadoop.io.NullWritable) InputFormat(org.apache.hadoop.mapred.InputFormat) Test(org.junit.Test)
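
For reference, the split-draining loop in this test can be factored into a small helper. The helper below is my own sketch (countRows is not a name from the test); apart from java.io.IOException it uses only types already listed above.

// Hypothetical helper (not in the original test): drain one split with the
// old mapred API and return how many rows it yields.
private static int countRows(InputFormat<NullWritable, OrcStruct> inf,
                             InputSplit split, JobConf job) throws IOException {
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr =
        inf.getRecordReader(split, job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    int rows = 0;
    while (rr.next(key, value)) {
        ++rows;
    }
    rr.close();
    return rows;
}

With such a helper, the expectation in the loop above amounts to two rows for each of the first four splits and zero rows for the fifth.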

Example 3 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

From the class TestOrcRecordUpdater, method testWriter:

@Test
public void testWriter() throws Exception {
    Path root = new Path(workDir, "testWriter");
    Configuration conf = new Configuration();
    // Must use raw local because the checksummer doesn't honor flushes.
    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
        .filesystem(fs).bucket(10).writingBase(false)
        .minimumTransactionId(10).maximumTransactionId(19)
        .inspector(inspector).reporter(Reporter.NULL).finalDestination(root);
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.insert(11, new MyRow("first"));
    updater.insert(11, new MyRow("second"));
    updater.insert(11, new MyRow("third"));
    updater.flush();
    updater.insert(12, new MyRow("fourth"));
    updater.insert(12, new MyRow("fifth"));
    updater.flush();
    // Check the stats
    assertEquals(5L, updater.getStats().getRowCount());
    Path bucketPath = AcidUtils.createFilename(root, options);
    Path sidePath = OrcAcidUtils.getSideFile(bucketPath);
    DataInputStream side = fs.open(sidePath);
    // read the stopping point for the first flush and make sure we only see
    // 3 rows
    long len = side.readLong();
    Reader reader = OrcFile.createReader(bucketPath, new OrcFile.ReaderOptions(conf).filesystem(fs).maxLength(len));
    assertEquals(3, reader.getNumberOfRows());
    // read the second flush and make sure we see all 5 rows
    len = side.readLong();
    side.close();
    reader = OrcFile.createReader(bucketPath, new OrcFile.ReaderOptions(conf).filesystem(fs).maxLength(len));
    assertEquals(5, reader.getNumberOfRows());
    RecordReader rows = reader.rows();
    // check the contents of the file
    assertEquals(true, rows.hasNext());
    OrcStruct row = (OrcStruct) rows.next(null);
    assertEquals(OrcRecordUpdater.INSERT_OPERATION, OrcRecordUpdater.getOperation(row));
    assertEquals(11, OrcRecordUpdater.getCurrentTransaction(row));
    assertEquals(11, OrcRecordUpdater.getOriginalTransaction(row));
    assertEquals(10, OrcRecordUpdater.getBucket(row));
    assertEquals(0, OrcRecordUpdater.getRowId(row));
    assertEquals("first", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(1, OrcRecordUpdater.getRowId(row));
    assertEquals(10, OrcRecordUpdater.getBucket(row));
    assertEquals("second", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(2, OrcRecordUpdater.getRowId(row));
    assertEquals(10, OrcRecordUpdater.getBucket(row));
    assertEquals("third", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(12, OrcRecordUpdater.getCurrentTransaction(row));
    assertEquals(12, OrcRecordUpdater.getOriginalTransaction(row));
    assertEquals(10, OrcRecordUpdater.getBucket(row));
    assertEquals(0, OrcRecordUpdater.getRowId(row));
    assertEquals("fourth", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(1, OrcRecordUpdater.getRowId(row));
    assertEquals("fifth", OrcRecordUpdater.getRow(row).getFieldValue(0).toString());
    assertEquals(false, rows.hasNext());
    // add one more record and close
    updater.insert(20, new MyRow("sixth"));
    updater.close(false);
    reader = OrcFile.createReader(bucketPath, new OrcFile.ReaderOptions(conf).filesystem(fs));
    assertEquals(6, reader.getNumberOfRows());
    assertEquals(6L, updater.getStats().getRowCount());
    assertEquals(false, fs.exists(sidePath));
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) DataInputStream(java.io.DataInputStream) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) Test(org.junit.Test)
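
The assertions in this test go through OrcRecordUpdater's static accessors for the ACID event columns (operation, current and original transaction, bucket, row id, and the wrapped row). As a side note, they can be bundled into a small dump helper; the method below is my own sketch and only calls accessors that already appear in the test.

// Hypothetical helper (name is mine): render one ACID event row using the
// static accessors exercised by the assertions above.
private static String describe(OrcStruct event) {
    return String.format("op=%d txn=%d origTxn=%d bucket=%d rowId=%d value=%s",
        OrcRecordUpdater.getOperation(event),
        OrcRecordUpdater.getCurrentTransaction(event),
        OrcRecordUpdater.getOriginalTransaction(event),
        OrcRecordUpdater.getBucket(event),
        OrcRecordUpdater.getRowId(event),
        OrcRecordUpdater.getRow(event).getFieldValue(0));
}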

Example 4 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

From the class TestOrcRawRecordMerger, method testRecordReaderDelta:

/**
   * Test the RecordReader when there are only delta files and no base.
   * @throws Exception
   */
@Test
public void testRecordReaderDelta() throws Exception {
    final int BUCKET = 0;
    Configuration conf = new Configuration();
    OrcOutputFormat of = new OrcOutputFormat();
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testRecordReaderDelta").makeQualified(fs);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    // write a delta
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf)
        .bucket(BUCKET).inspector(inspector).filesystem(fs)
        .writingBase(false).minimumTransactionId(1).maximumTransactionId(1)
        .finalDestination(root);
    RecordUpdater ru = of.getRecordUpdater(root, options);
    String[] values = new String[] { "a", "b", "c", "d", "e" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(1, new MyRow(values[i]));
    }
    ru.close(false);
    // write a second delta
    options.minimumTransactionId(2).maximumTransactionId(2);
    ru = of.getRecordUpdater(root, options);
    values = new String[] { "f", "g", "h", "i", "j" };
    for (int i = 0; i < values.length; ++i) {
        ru.insert(2, new MyRow(values[i]));
    }
    ru.close(false);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.min.split.size", "1");
    job.set("mapred.max.split.size", "2");
    job.set("mapred.input.dir", root.toString());
    job.set("bucket_count", "1");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    InputSplit[] splits = inf.getSplits(job, 5);
    assertEquals(1, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
    rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    values = new String[] { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j" };
    OrcStruct row = rr.createValue();
    for (int i = 0; i < values.length; ++i) {
        System.out.println("Checking " + i);
        assertEquals(true, rr.next(NullWritable.get(), row));
        assertEquals(values[i], row.getFieldValue(0).toString());
    }
    assertEquals(false, rr.next(NullWritable.get(), row));
}
Also used : Path(org.apache.hadoop.fs.Path) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Configuration(org.apache.hadoop.conf.Configuration) NullWritable(org.apache.hadoop.io.NullWritable) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) InputFormat(org.apache.hadoop.mapred.InputFormat) FileSystem(org.apache.hadoop.fs.FileSystem) RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) JobConf(org.apache.hadoop.mapred.JobConf) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
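
The read-side JobConf wiring is nearly identical across these tests. The helper below is a sketch of pulling it out; the helper and its name are mine, but the configuration keys are exactly the ones set in the tests above.

// Hypothetical helper (not in the original tests): common JobConf setup for
// reading an ACID table directory with OrcInputFormat.
private static JobConf acidReadConf(Path root, String columnNames, String columnTypes) {
    JobConf job = new JobConf();
    job.set("mapred.input.dir", root.toString());
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, columnNames);
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, columnTypes);
    HiveConf.setBoolVar(job, HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN, true);
    return job;
}

Test-specific settings such as mapred.min.split.size, mapred.max.split.size or bucket_count can then be layered on top of the returned JobConf.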

Example 5 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache.

From the class AbstractRecordWriter, method closeBatch:

@Override
public void closeBatch() throws StreamingIOFailure {
    boolean haveError = false;
    for (RecordUpdater updater : updaters) {
        if (updater != null) {
            try {
                // try not to leave any files open
                updater.close(false);
            } catch (Exception ex) {
                haveError = true;
                LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex);
            }
        }
    }
    updaters.clear();
    if (haveError) {
        throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark());
    }
}
Also used : RecordUpdater(org.apache.hadoop.hive.ql.io.RecordUpdater) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) TException(org.apache.thrift.TException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException)
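
A variant of the same "close everything, fail once at the end" pattern is sketched below. It keeps the first exception as the cause of the summary failure; this assumes StreamingIOFailure offers a (String, Exception) constructor, which the method above does not rely on, and it reuses the updaters, LOG and getWatermark() members of AbstractRecordWriter.

    // Sketch only: like closeBatch(), but remember the first failure so it can
    // be attached as the cause of the summary exception.
    Exception firstFailure = null;
    for (RecordUpdater updater : updaters) {
        if (updater == null) {
            continue;
        }
        try {
            updater.close(false);
        } catch (Exception ex) {
            if (firstFailure == null) {
                firstFailure = ex;
            }
            LOG.error("Unable to close " + updater + " due to: " + ex.getMessage(), ex);
        }
    }
    updaters.clear();
    if (firstFailure != null) {
        // Assumed constructor: StreamingIOFailure(String message, Exception cause)
        throw new StreamingIOFailure("Encountered errors while closing (see logs) " + getWatermark(), firstFailure);
    }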

Aggregations

RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 10
Path (org.apache.hadoop.fs.Path): 9
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 9
Configuration (org.apache.hadoop.conf.Configuration): 8
FileSystem (org.apache.hadoop.fs.FileSystem): 8
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 8
Test (org.junit.Test): 6
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
NullWritable (org.apache.hadoop.io.NullWritable): 4
InputFormat (org.apache.hadoop.mapred.InputFormat): 4
InputSplit (org.apache.hadoop.mapred.InputSplit): 4
MemoryManager (org.apache.orc.impl.MemoryManager): 2
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
DataInputStream (java.io.DataInputStream): 1
IOException (java.io.IOException): 1
PrintStream (java.io.PrintStream): 1
Properties (java.util.Properties): 1
ValidReadTxnList (org.apache.hadoop.hive.common.ValidReadTxnList): 1
ValidTxnList (org.apache.hadoop.hive.common.ValidTxnList): 1