
Example 11 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache: class TestOrcRecordUpdater, method testWriterTblProperties.

@Test
public void testWriterTblProperties() throws Exception {
    Path root = new Path(workDir, "testWriterTblProperties");
    Configuration conf = new Configuration();
    // Must use raw local because the checksummer doesn't honor flushes.
    FileSystem fs = FileSystem.getLocal(conf).getRaw();
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Properties tblProps = new Properties();
    tblProps.setProperty("orc.compress", "SNAPPY");
    tblProps.setProperty("orc.compress.size", "8192");
    HiveConf.setIntVar(conf, HiveConf.ConfVars.HIVE_ORC_BASE_DELTA_RATIO, 4);
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(10).writingBase(false).minimumWriteId(10).maximumWriteId(19).inspector(inspector).reporter(Reporter.NULL).finalDestination(root).tableProperties(tblProps);
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    updater.insert(11, new MyRow("first"));
    updater.insert(11, new MyRow("second"));
    updater.insert(11, new MyRow("third"));
    updater.flush();
    updater.insert(12, new MyRow("fourth"));
    updater.insert(12, new MyRow("fifth"));
    updater.flush();
    PrintStream origOut = System.out;
    ByteArrayOutputStream myOut = new ByteArrayOutputStream();
    System.setOut(new PrintStream(myOut));
    FileDump.main(new String[] { root.toUri().toString() });
    System.out.flush();
    String outDump = new String(myOut.toByteArray());
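    // The dump below is expected to show the delta writer's compression buffer scaled down
    // by HIVE_ORC_BASE_DELTA_RATIO: orc.compress.size is 8192 and the ratio is 4, hence 2048.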
    assertEquals(true, outDump.contains("Compression: SNAPPY"));
    assertEquals(true, outDump.contains("Compression size: 2048"));
    System.setOut(origOut);
    updater.close(false);
}
Also used: Path (org.apache.hadoop.fs.Path), PrintStream (java.io.PrintStream), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), Configuration (org.apache.hadoop.conf.Configuration), ByteArrayOutputStream (java.io.ByteArrayOutputStream), Properties (java.util.Properties), AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat), FileSystem (org.apache.hadoop.fs.FileSystem), RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater), Test (org.junit.Test)

Example 12 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache: class TestVectorizedOrcAcidRowBatchReader, method setup.

@Before
public void setup() throws Exception {
    conf = new JobConf();
    conf.set("bucket_count", "1");
    conf.set(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, "true");
    conf.setBoolean(HiveConf.ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN.varname, true);
    conf.set(hive_metastoreConstants.TABLE_TRANSACTIONAL_PROPERTIES, "default");
    conf.setInt(HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname, AcidUtils.AcidOperationalProperties.getDefault().toInt());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, DummyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, DummyRow.getColumnTypesProperty());
    conf.setBoolean(HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED.varname, true);
    conf.set(HiveConf.ConfVars.HIVE_ORC_SPLIT_STRATEGY.varname, "BI");
    Path workDir = new Path(System.getProperty("test.tmp.dir", "target" + File.separator + "test" + File.separator + "tmp"));
    root = new Path(workDir, "TestVectorizedOrcAcidRowBatch.testDump");
    fs = root.getFileSystem(conf);
    root = fs.makeQualified(root);
    fs.delete(root, true);
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(DummyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    int bucket = 0;
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).filesystem(fs).bucket(bucket).writingBase(false).minimumWriteId(1).maximumWriteId(NUM_OWID).inspector(inspector).reporter(Reporter.NULL).recordIdColumn(1).finalDestination(root);
    RecordUpdater updater = new OrcRecordUpdater(root, options);
    // Create a single insert delta with 150,000 rows, with 15000 rowIds per original transaction id.
    for (long i = 1; i <= NUM_OWID; ++i) {
        for (long j = 0; j < NUM_ROWID_PER_OWID; ++j) {
            long payload = (i - 1) * NUM_ROWID_PER_OWID + j;
            updater.insert(i, new DummyRow(payload, j, i, bucket));
        }
    }
    updater.close(false);
    // Now create three types of delete deltas: the first has rowIds divisible by 2 but not by 3,
    // the second has rowIds divisible by 3 but not by 2, and the third has rowIds divisible by
    // both 2 and 3. This should produce delete deltas that will thoroughly test the sort-merge
    // logic when the delete events in the delete delta files interleave in the sort order.
    // Create a delete delta that has rowIds divisible by 2 but not by 3. This will produce
    // a delete delta file with 50,000 delete events.
    long currTxnId = NUM_OWID + 1;
    options.minimumWriteId(currTxnId).maximumWriteId(currTxnId);
    updater = new OrcRecordUpdater(root, options);
    for (long i = 1; i <= NUM_OWID; ++i) {
        for (long j = 0; j < NUM_ROWID_PER_OWID; j += 1) {
            if (j % 2 == 0 && j % 3 != 0) {
                updater.delete(currTxnId, new DummyRow(-1, j, i, bucket));
            }
        }
    }
    updater.close(false);
    // Now, create a delete delta that has rowIds divisible by 3 but not by 2. This will produce
    // a delete delta file with 25,000 delete events.
    currTxnId = NUM_OWID + 2;
    options.minimumWriteId(currTxnId).maximumWriteId(currTxnId);
    updater = new OrcRecordUpdater(root, options);
    for (long i = 1; i <= NUM_OWID; ++i) {
        for (long j = 0; j < NUM_ROWID_PER_OWID; j += 1) {
            if (j % 2 != 0 && j % 3 == 0) {
                updater.delete(currTxnId, new DummyRow(-1, j, i, bucket));
            }
        }
    }
    updater.close(false);
    // Now, create a delete delta that has rowIds divisible by both 3 and 2. This will produce
    // a delete delta file with 25,000 delete events.
    currTxnId = NUM_OWID + 3;
    options.minimumWriteId(currTxnId).maximumWriteId(currTxnId);
    updater = new OrcRecordUpdater(root, options);
    for (long i = 1; i <= NUM_OWID; ++i) {
        for (long j = 0; j < NUM_ROWID_PER_OWID; j += 1) {
            if (j % 2 == 0 && j % 3 == 0) {
                updater.delete(currTxnId, new DummyRow(-1, j, i, bucket));
            }
        }
    }
    updater.close(false);
}
Also used: Path (org.apache.hadoop.fs.Path), ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector), JobConf (org.apache.hadoop.mapred.JobConf), RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater), AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat), Before (org.junit.Before)
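
The delete-event counts quoted in the comments above (50,000 / 25,000 / 25,000) follow from the row layout: 15,000 rowIds per original write id and, given the 150,000-row total, 10 original write ids, with each delete delta touching every write id. A minimal standalone sketch of that arithmetic, assuming those two values match the test's NUM_ROWID_PER_OWID and NUM_OWID constants:

public class DeleteDeltaCounts {
    public static void main(String[] args) {
        // Assumed to mirror the test constants: 10 original write ids, 15,000 rowIds each.
        long numOwid = 10;
        long numRowIdPerOwid = 15_000;
        long div2Not3 = 0, div3Not2 = 0, div6 = 0;
        for (long j = 0; j < numRowIdPerOwid; j++) {
            if (j % 2 == 0 && j % 3 != 0) div2Not3++;
            if (j % 2 != 0 && j % 3 == 0) div3Not2++;
            if (j % 2 == 0 && j % 3 == 0) div6++;
        }
        // Each delete delta spans all original write ids, so scale by numOwid.
        System.out.println("delta 1 (div by 2, not 3): " + div2Not3 * numOwid); // 50000
        System.out.println("delta 2 (div by 3, not 2): " + div3Not2 * numOwid); // 25000
        System.out.println("delta 3 (div by 6):        " + div6 * numOwid);     // 25000
    }
}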

Example 13 with RecordUpdater

Use of org.apache.hadoop.hive.ql.io.RecordUpdater in project hive by apache: class TestInputOutputFormat, method testACIDReaderNoFooterSerializeWithDeltas.

@Test
public void testACIDReaderNoFooterSerializeWithDeltas() throws Exception {
    conf.set("fs.defaultFS", "mock:///");
    conf.set("fs.mock.impl", MockFileSystem.class.getName());
    FileSystem fs = FileSystem.get(conf);
    MockPath mockPath = new MockPath(fs, "mock:///mocktable7");
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, MyRow.getColumnNamesProperty());
    conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, MyRow.getColumnTypesProperty());
    conf.set("hive.orc.splits.include.file.footer", "false");
    conf.set("mapred.input.dir", mockPath.toString());
    StructObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(MyRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(new Path(mockPath + "/0_0"), OrcFile.writerOptions(conf).blockPadding(false).bufferSize(1024).inspector(inspector));
    for (int i = 0; i < 10; ++i) {
        writer.addRow(new MyRow(i, 2 * i));
    }
    writer.close();
    AcidOutputFormat.Options options = new AcidOutputFormat.Options(conf).bucket(1).minimumWriteId(1).maximumWriteId(1).inspector(inspector).finalDestination(mockPath);
    OrcOutputFormat of = new OrcOutputFormat();
    RecordUpdater ru = of.getRecordUpdater(mockPath, options);
    for (int i = 0; i < 10; ++i) {
        ru.insert(options.getMinimumWriteId(), new MyRow(i, 2 * i));
    }
    // this deletes the side file
    ru.close(false);
    // set up props for read
    conf.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    AcidUtils.setAcidOperationalProperties(conf, true, null);
    OrcInputFormat orcInputFormat = new OrcInputFormat();
    InputSplit[] splits = orcInputFormat.getSplits(conf, 2);
    assertEquals(2, splits.length);
    int readOpsBefore = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsBefore = statistics.getReadOps();
        }
    }
    assertTrue("MockFS has stats. Read ops not expected to be -1", readOpsBefore != -1);
    for (InputSplit split : splits) {
        assertTrue("OrcSplit is expected", split instanceof OrcSplit);
        // ETL strategies will have start=3 (start of first stripe)
        assertTrue(split.toString().contains("start=3"));
        assertTrue(split.toString().contains("hasFooter=false"));
        assertTrue(split.toString().contains("hasBase=true"));
        assertFalse("No footer serialize test for ACID reader, hasFooter is not expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
        orcInputFormat.getRecordReader(split, conf, Reporter.NULL);
    }
    int readOpsDelta = -1;
    for (FileSystem.Statistics statistics : FileSystem.getAllStatistics()) {
        if (statistics.getScheme().equalsIgnoreCase("mock")) {
            readOpsDelta = statistics.getReadOps() - readOpsBefore;
        }
    }
    // call-1: open(mock:/mocktable7/0_0)
    // call-2: open(mock:/mocktable7/0_0)
    // call-3: listLocatedFileStatuses(mock:/mocktable7)
    // call-4: getFileStatus(mock:/mocktable7/delta_0000001_0000001_0000/_metadata_acid)
    // call-5: open(mock:/mocktable7/delta_0000001_0000001_0000/bucket_00001)
    // call-6: getFileStatus(mock:/mocktable7/delta_0000001_0000001_0000/_metadata_acid)
    // call-7: open(mock:/mocktable7/delta_0000001_0000001_0000/bucket_00001)
    assertEquals(7, readOpsDelta);
    // revert back to local fs
    conf.set("fs.defaultFS", "file:///");
}
Also used: AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat), RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater), InputSplit (org.apache.hadoop.mapred.InputSplit), RecordWriter (org.apache.hadoop.mapred.RecordWriter), StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector), Test (org.junit.Test)
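
The read-op trace above also shows the delta directory naming produced by minimumWriteId(1).maximumWriteId(1): delta_<minWriteId>_<maxWriteId>_<statementId>, zero-padded. A small illustrative reconstruction of that name with plain String.format (this mimics the convention seen in the trace, not Hive's internal helper):

public class DeltaDirName {
    public static void main(String[] args) {
        // Rebuild the directory name seen in the trace, delta_0000001_0000001_0000,
        // from minWriteId=1, maxWriteId=1, statementId=0 (field widths inferred from the trace).
        String deltaDir = String.format("delta_%07d_%07d_%04d", 1L, 1L, 0);
        System.out.println(deltaDir); // prints delta_0000001_0000001_0000
    }
}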

Aggregations

RecordUpdater (org.apache.hadoop.hive.ql.io.RecordUpdater): 13 usages
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 11 usages
Path (org.apache.hadoop.fs.Path): 10 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 10 usages
Configuration (org.apache.hadoop.conf.Configuration): 9 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 9 usages
Test (org.junit.Test): 9 usages
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 8 usages
InputSplit (org.apache.hadoop.mapred.InputSplit): 6 usages
JobConf (org.apache.hadoop.mapred.JobConf): 5 usages
NullWritable (org.apache.hadoop.io.NullWritable): 4 usages
InputFormat (org.apache.hadoop.mapred.InputFormat): 4 usages
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 2 usages
MemoryManager (org.apache.orc.MemoryManager): 2 usages
MemoryManagerImpl (org.apache.orc.impl.MemoryManagerImpl): 2 usages
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1 usage
DataInputStream (java.io.DataInputStream): 1 usage
IOException (java.io.IOException): 1 usage
PrintStream (java.io.PrintStream): 1 usage
Properties (java.util.Properties): 1 usage