
Example 11 with ValidReaderWriteIdList

Use of org.apache.hadoop.hive.common.ValidReaderWriteIdList in project hive by apache.

From the class TestOrcRawRecordMerger, method testOriginalReaderPair.

@Test
public void testOriginalReaderPair() throws Exception {
    int BUCKET = 10;
    ReaderKey key = new ReaderKey();
    Configuration conf = new Configuration();
    int bucketProperty = OrcRawRecordMerger.encodeBucketId(conf, BUCKET, 0);
    Reader reader = createMockOriginalReader();
    RecordIdentifier minKey = new RecordIdentifier(0, bucketProperty, 1);
    RecordIdentifier maxKey = new RecordIdentifier(0, bucketProperty, 3);
    boolean[] includes = new boolean[] { true, true };
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testOriginalReaderPair");
    fs.makeQualified(root);
    fs.create(root);
    ReaderPair pair = new OrcRawRecordMerger.OriginalReaderPairToRead(key, reader, BUCKET, minKey, maxKey, new Reader.Options().include(includes), new OrcRawRecordMerger.Options().rootPath(root), conf, new ValidReaderWriteIdList(), 0);
    RecordReader recordReader = pair.getRecordReader();
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(2, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    assertEquals("third", value(pair.nextRecord()));
    pair.next(pair.nextRecord());
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(3, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    assertEquals("fourth", value(pair.nextRecord()));
    pair.next(pair.nextRecord());
    assertEquals(null, pair.nextRecord());
    Mockito.verify(recordReader).close();
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) ReaderPair(org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair) FileSystem(org.apache.hadoop.fs.FileSystem) ReaderKey(org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey) ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList) Test(org.junit.Test)
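
For orientation, here is a small, self-contained sketch (not taken from the Hive test class) of how a ValidReaderWriteIdList built with the four-argument constructor used elsewhere in these examples behaves. The table name and watermark are illustrative values, and the expected results reflect my reading of the reader-list semantics rather than anything asserted by the test above.

import java.util.BitSet;
import org.apache.hadoop.hive.common.ValidReaderWriteIdList;

public class WriteIdListSketch {
    public static void main(String[] args) {
        // Everything at or below the high watermark (3) that is not listed as an
        // exception should be readable; anything above it should not be.
        ValidReaderWriteIdList writeIds =
            new ValidReaderWriteIdList("default.t", new long[0], new BitSet(), 3);
        System.out.println(writeIds.isWriteIdValid(2));  // expected: true
        System.out.println(writeIds.isWriteIdValid(3));  // expected: true (at the watermark)
        System.out.println(writeIds.isWriteIdValid(4));  // expected: false (beyond the watermark)
    }
}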

Example 12 with ValidReaderWriteIdList

Use of org.apache.hadoop.hive.common.ValidReaderWriteIdList in project hive by apache.

From the class TestOrcRawRecordMerger, method testOriginalReaderPairNoMin.

@Test
public void testOriginalReaderPairNoMin() throws Exception {
    int BUCKET = 10;
    ReaderKey key = new ReaderKey();
    Reader reader = createMockOriginalReader();
    Configuration conf = new Configuration();
    int bucketProperty = OrcRawRecordMerger.encodeBucketId(conf, BUCKET, 0);
    FileSystem fs = FileSystem.getLocal(conf);
    Path root = new Path(tmpDir, "testOriginalReaderPairNoMin");
    fs.makeQualified(root);
    fs.create(root);
    ReaderPair pair = new OrcRawRecordMerger.OriginalReaderPairToRead(key, reader, BUCKET, null, null, new Reader.Options(), new OrcRawRecordMerger.Options().rootPath(root), conf, new ValidReaderWriteIdList(), 0);
    assertEquals("first", value(pair.nextRecord()));
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(0, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    pair.next(pair.nextRecord());
    assertEquals("second", value(pair.nextRecord()));
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(1, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    pair.next(pair.nextRecord());
    assertEquals("third", value(pair.nextRecord()));
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(2, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    pair.next(pair.nextRecord());
    assertEquals("fourth", value(pair.nextRecord()));
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(3, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    pair.next(pair.nextRecord());
    assertEquals("fifth", value(pair.nextRecord()));
    assertEquals(0, key.getWriteId());
    assertEquals(bucketProperty, key.getBucketProperty());
    assertEquals(4, key.getRowId());
    assertEquals(0, key.getCurrentWriteId());
    pair.next(pair.nextRecord());
    assertEquals(null, pair.nextRecord());
    Mockito.verify(pair.getRecordReader()).close();
}
Also used : Path(org.apache.hadoop.fs.Path) ReaderPair(org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) ReaderKey(org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey) ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList) Test(org.junit.Test)
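
The two tests above walk a ReaderPair by hand. As a sketch of the same nextRecord()/next() protocol, the hypothetical helper below (not part of Hive) drains a pair and counts its rows; it assumes it lives in the same package as OrcRawRecordMerger, like the tests do, so that the nested ReaderPair type is accessible, and that an already constructed pair is passed in.

package org.apache.hadoop.hive.ql.io.orc;

import org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair;

// Hypothetical helper, not present in Hive: counts the rows a ReaderPair yields
// by following the same protocol the tests above assert on.
final class ReaderPairDrainer {
    static int countRows(ReaderPair pair) throws Exception {
        int rows = 0;
        while (pair.nextRecord() != null) {   // current row, or null once the pair is exhausted
            rows++;
            pair.next(pair.nextRecord());     // advance to the following row
        }
        return rows;
    }
}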

Example 13 with ValidReaderWriteIdList

Use of org.apache.hadoop.hive.common.ValidReaderWriteIdList in project hive by apache.

From the class OrcInputFormat, method getReader.

@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit, Options options) throws IOException {
    final OrcSplit split = (OrcSplit) inputSplit;
    // Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
    AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(options.getConfiguration());
    if (!acidOperationalProperties.isSplitUpdate()) {
        throw new IllegalStateException("Expected SpliUpdate table: " + split.getPath());
    }
    final Path[] deltas = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(split);
    final Configuration conf = options.getConfiguration();
    final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
    OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isCompacting(false);
    mergerOptions.rootPath(split.getRootDir());
    mergerOptions.bucketPath(split.getPath());
    final int bucket;
    if (split.hasBase()) {
        AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf);
        if (acidIOOptions.getBucketId() < 0) {
            LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring");
        }
        bucket = acidIOOptions.getBucketId();
        if (split.isOriginal()) {
            mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath());
        }
    } else {
        bucket = (int) split.getStart();
        assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath();
    }
    // todo: createOptionsForReader() assumes it's !isOriginal.... why?
    final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
    readOptions.range(split.getStart(), split.getLength());
    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
    ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
    LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN));
    final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions);
    return new RowReader<OrcStruct>() {

        OrcStruct innerRecord = records.createValue();

        @Override
        public ObjectInspector getObjectInspector() {
            return OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(readOptions.getSchema()));
        }

        @Override
        public boolean next(RecordIdentifier recordIdentifier, OrcStruct orcStruct) throws IOException {
            boolean result;
            // filter out the deleted records
            do {
                result = records.next(recordIdentifier, innerRecord);
            } while (result && OrcRecordUpdater.getOperation(innerRecord) == OrcRecordUpdater.DELETE_OPERATION);
            if (result) {
                // swap the fields with the passed in orcStruct
                orcStruct.linkFields(OrcRecordUpdater.getRow(innerRecord));
            }
            return result;
        }

        @Override
        public RecordIdentifier createKey() {
            return records.createKey();
        }

        @Override
        public OrcStruct createValue() {
            return new OrcStruct(records.getColumns());
        }

        @Override
        public long getPos() throws IOException {
            return records.getPos();
        }

        @Override
        public void close() throws IOException {
            records.close();
        }

        @Override
        public float getProgress() throws IOException {
            return records.getProgress();
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) BatchToRowReader(org.apache.hadoop.hive.ql.io.BatchToRowReader) Configuration(org.apache.hadoop.conf.Configuration) StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader) AcidOutputFormat(org.apache.hadoop.hive.ql.io.AcidOutputFormat) RecordIdentifier(org.apache.hadoop.hive.ql.io.RecordIdentifier) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList) AcidOperationalProperties(org.apache.hadoop.hive.ql.io.AcidUtils.AcidOperationalProperties) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils)
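
The getReader() method above depends on a serialized write-id list being present in the configuration under ValidWriteIdList.VALID_WRITEIDS_KEY; inside Hive that entry is populated by the query machinery, not by hand. The sketch below (illustrative values, not Hive code) just shows the writeToString()/parse round trip that the conf.get(...) branch relies on.

import java.util.BitSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;

public class WriteIdConfRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Serialize a made-up write-id list into the conf, standing in for what
        // Hive's query machinery would normally put there.
        ValidWriteIdList writeIds =
            new ValidReaderWriteIdList("default.acid_tbl", new long[0], new BitSet(), 7);
        conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.writeToString());

        // Read it back the same way getReader() does.
        String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
        ValidWriteIdList roundTripped = (txnString == null)
            ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
        System.out.println(roundTripped.getHighWatermark());  // expected: 7
    }
}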

Example 14 with ValidReaderWriteIdList

Use of org.apache.hadoop.hive.common.ValidReaderWriteIdList in project hive by apache.

From the class Cleaner, method clean.

private void clean(CompactionInfo ci) throws MetaException {
    LOG.info("Starting cleaning for " + ci.getFullPartitionName());
    try {
        Table t = resolveTable(ci);
        if (t == null) {
            // The table was dropped before we got around to cleaning it.
            LOG.info("Unable to find table " + ci.getFullTableName() + ", assuming it was dropped");
            txnHandler.markCleaned(ci);
            return;
        }
        Partition p = null;
        if (ci.partName != null) {
            p = resolvePartition(ci);
            if (p == null) {
                // The partition was dropped before we got around to cleaning it.
                LOG.info("Unable to find partition " + ci.getFullPartitionName() + ", assuming it was dropped");
                txnHandler.markCleaned(ci);
                return;
            }
        }
        StorageDescriptor sd = resolveStorageDescriptor(t, p);
        final String location = sd.getLocation();
        /**
         * Each Compaction only compacts as far as the highest txn id such that all txns below it
         * are resolved (i.e. not opened).  This is what "highestWriteId" tracks.  This is only tracked
         * since Hive 1.3.0/2.0 - thus may be 0.  See ValidCompactorWriteIdList and uses for more info.
         *
         * We only want to clean up to the highestWriteId - otherwise we risk deleting deltas from
         * under an active reader.
         *
         * Suppose we have deltas D2 D3 for table T, i.e. the last compaction created D3 so now there is a
         * clean request for D2.
         * Cleaner checks existing locks and finds none.
         * Between that check and removeFiles() a query starts (it will be reading D3) and another compaction
         * completes which creates D4.
         * Now removeFiles() (more specifically AcidUtils.getAcidState()) will declare D3 to be obsolete
         * unless ValidTxnList is "capped" at highestWriteId.
         */
        final ValidWriteIdList txnList = (ci.highestWriteId > 0) ? new ValidReaderWriteIdList(ci.getFullTableName(), new long[0], new BitSet(), ci.highestWriteId) : new ValidReaderWriteIdList();
        if (runJobAsSelf(ci.runAs)) {
            removeFiles(location, txnList);
        } else {
            LOG.info("Cleaning as user " + ci.runAs + " for " + ci.getFullPartitionName());
            UserGroupInformation ugi = UserGroupInformation.createProxyUser(ci.runAs, UserGroupInformation.getLoginUser());
            ugi.doAs(new PrivilegedExceptionAction<Object>() {

                @Override
                public Object run() throws Exception {
                    removeFiles(location, txnList);
                    return null;
                }
            });
            try {
                FileSystem.closeAllForUGI(ugi);
            } catch (IOException exception) {
                LOG.error("Could not clean up file-system handles for UGI: " + ugi + " for " + ci.getFullPartitionName(), exception);
            }
        }
        txnHandler.markCleaned(ci);
    } catch (Exception e) {
        LOG.error("Caught exception when cleaning, unable to complete cleaning of " + ci + " " + StringUtils.stringifyException(e));
        txnHandler.markFailed(ci);
    }
}
Also used : Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) BitSet(java.util.BitSet) IOException(java.io.IOException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) ValidWriteIdList(org.apache.hadoop.hive.common.ValidWriteIdList) ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)
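
To make the "capping" described in the comment above concrete, the sketch below (illustrative values only, not Hive code) builds the same kind of list clean() does with highestWriteId = 3 and checks range validity the way write-id-list consumers would; the expected outputs reflect my understanding of the reader-list semantics.

import java.util.BitSet;
import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;

public class CappedWriteIdListSketch {
    public static void main(String[] args) {
        // Cap at write id 3, with no open or aborted write ids below it.
        ValidWriteIdList capped =
            new ValidReaderWriteIdList("default.t", new long[0], new BitSet(), 3);
        // A delta covering write ids 1..3 sits entirely at or below the cap.
        System.out.println(capped.isWriteIdRangeValid(1, 3));  // expected: ALL
        // A delta created after the cap (write id 4) is not considered valid,
        // so its presence cannot cause older files to be treated as obsolete here.
        System.out.println(capped.isWriteIdRangeValid(4, 4));  // expected: NONE
    }
}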

Example 15 with ValidReaderWriteIdList

Use of org.apache.hadoop.hive.common.ValidReaderWriteIdList in project hive by apache.

From the class FetchOperator, method extractValidWriteIdList.

private ValidWriteIdList extractValidWriteIdList() {
    if (currDesc.getTableName() == null || !org.apache.commons.lang.StringUtils.isBlank(currDesc.getTableName())) {
        String txnString = job.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
        LOG.debug("FetchOperator get writeIdStr: " + txnString);
        return txnString == null ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
    }
    // not fetching from a table directly but from a temp location
    return null;
}
Also used : ValidReaderWriteIdList(org.apache.hadoop.hive.common.ValidReaderWriteIdList)
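
A small sketch of the fallback path in extractValidWriteIdList(): when the job conf carries no serialized list, the code falls back to the no-argument ValidReaderWriteIdList, which to my understanding treats every write id as readable (worth verifying against your Hive version). The freshly constructed JobConf here is purely illustrative.

import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.mapred.JobConf;

public class FallbackWriteIdListSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        String txnString = job.get(ValidWriteIdList.VALID_WRITEIDS_KEY);  // null: nothing was set
        ValidWriteIdList writeIds = (txnString == null)
            ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
        System.out.println(writeIds.isWriteIdValid(42));  // expected: true with the default list
    }
}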

Aggregations

ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList) 20
Configuration (org.apache.hadoop.conf.Configuration) 17
Test (org.junit.Test) 15
Path (org.apache.hadoop.fs.Path) 13
MockFile (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFile) 12
MockFileSystem (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockFileSystem) 12
MockPath (org.apache.hadoop.hive.ql.io.orc.TestInputOutputFormat.MockPath) 12
FileStatus (org.apache.hadoop.fs.FileStatus) 8
FileSystem (org.apache.hadoop.fs.FileSystem) 4
ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList) 4
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier) 4
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat) 3
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils) 3
ReaderKey (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderKey) 3
BitSet (java.util.BitSet) 2
ReaderPair (org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.ReaderPair) 2
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) 2
HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId) 2
OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils) 2