Example 6 with ValidWriteIdList

Use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.

From class AcidUtils, method getAcidFilesForStats:

public static List<FileStatus> getAcidFilesForStats(Table table, Path dir, Configuration jc, FileSystem fs) throws IOException {
    List<FileStatus> fileList = new ArrayList<>();
    ValidWriteIdList idList = AcidUtils.getTableValidWriteIdList(jc, AcidUtils.getFullTableName(table.getDbName(), table.getTableName()));
    if (idList == null) {
        LOG.warn("Cannot get ACID state for " + table.getDbName() + "." + table.getTableName() + " from " + jc.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
        return null;
    }
    Directory acidInfo = AcidUtils.getAcidState(dir, jc, idList);
    // Assume that for an MM table, or if there's only the base directory, we are good.
    if (!acidInfo.getCurrentDirectories().isEmpty() && AcidUtils.isFullAcidTable(table)) {
        Utilities.FILE_OP_LOGGER.warn("Computing stats for an ACID table; stats may be inaccurate");
    }
    if (fs == null) {
        fs = dir.getFileSystem(jc);
    }
    for (HdfsFileStatusWithId hfs : acidInfo.getOriginalFiles()) {
        fileList.add(hfs.getFileStatus());
    }
    for (ParsedDelta delta : acidInfo.getCurrentDirectories()) {
        for (FileStatus f : HiveStatsUtils.getFileStatusRecurse(delta.getPath(), -1, fs)) {
            fileList.add(f);
        }
    }
    if (acidInfo.getBaseDirectory() != null) {
        for (FileStatus f : HiveStatsUtils.getFileStatusRecurse(acidInfo.getBaseDirectory(), -1, fs)) {
            fileList.add(f);
        }
    }
    return fileList;
}
Also used: FileStatus (org.apache.hadoop.fs.FileStatus), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), HdfsFileStatusWithId (org.apache.hadoop.hive.shims.HadoopShims.HdfsFileStatusWithId), ArrayList (java.util.ArrayList)
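
For reference, here is a minimal sketch of how the write-id list consulted above can be resolved on its own. The helper class and method names (WriteIdLookup, forTable) are hypothetical; the body only reuses the AcidUtils.getFullTableName and AcidUtils.getTableValidWriteIdList calls shown in getAcidFilesForStats.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.ValidWriteIdList;
import org.apache.hadoop.hive.ql.io.AcidUtils;

public class WriteIdLookup {

    // Hypothetical helper: resolves the table's ValidWriteIdList from the job
    // configuration the same way getAcidFilesForStats() does above. Returns null
    // when no ACID state has been published for the table, so callers must check.
    public static ValidWriteIdList forTable(Configuration conf, String dbName, String tableName) {
        String fullTableName = AcidUtils.getFullTableName(dbName, tableName);
        return AcidUtils.getTableValidWriteIdList(conf, fullTableName);
    }
}

A caller would then pass the non-null result to AcidUtils.getAcidState(), as the method above does.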

Example 7 with ValidWriteIdList

Use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.

From class OrcInputFormat, method getReader:

@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit, Options options) throws IOException {
    final OrcSplit split = (OrcSplit) inputSplit;
    // Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
    AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(options.getConfiguration());
    if (!acidOperationalProperties.isSplitUpdate()) {
        throw new IllegalStateException("Expected SplitUpdate table: " + split.getPath());
    }
    final Path[] deltas = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(split);
    final Configuration conf = options.getConfiguration();
    final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
    OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isCompacting(false);
    mergerOptions.rootPath(split.getRootDir());
    mergerOptions.bucketPath(split.getPath());
    final int bucket;
    if (split.hasBase()) {
        AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf);
        if (acidIOOptions.getBucketId() < 0) {
            LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring");
        }
        bucket = acidIOOptions.getBucketId();
        if (split.isOriginal()) {
            mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath());
        }
    } else {
        bucket = (int) split.getStart();
        assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath();
    }
    // todo: createOptionsForReader() assumes it's !isOriginal.... why?
    final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
    readOptions.range(split.getStart(), split.getLength());
    String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
    ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
    LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN));
    final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions);
    return new RowReader<OrcStruct>() {

        OrcStruct innerRecord = records.createValue();

        @Override
        public ObjectInspector getObjectInspector() {
            return OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(readOptions.getSchema()));
        }

        @Override
        public boolean next(RecordIdentifier recordIdentifier, OrcStruct orcStruct) throws IOException {
            boolean result;
            // filter out the deleted records
            do {
                result = records.next(recordIdentifier, innerRecord);
            } while (result && OrcRecordUpdater.getOperation(innerRecord) == OrcRecordUpdater.DELETE_OPERATION);
            if (result) {
                // swap the fields with the passed in orcStruct
                orcStruct.linkFields(OrcRecordUpdater.getRow(innerRecord));
            }
            return result;
        }

        @Override
        public RecordIdentifier createKey() {
            return records.createKey();
        }

        @Override
        public OrcStruct createValue() {
            return new OrcStruct(records.getColumns());
        }

        @Override
        public long getPos() throws IOException {
            return records.getPos();
        }

        @Override
        public void close() throws IOException {
            records.close();
        }

        @Override
        public float getProgress() throws IOException {
            return records.getProgress();
        }
    };
}
Also used: Path (org.apache.hadoop.fs.Path), BatchToRowReader (org.apache.hadoop.hive.ql.io.BatchToRowReader), Configuration (org.apache.hadoop.conf.Configuration), StatsProvidingRecordReader (org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader), AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat), RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList), AcidOperationalProperties (org.apache.hadoop.hive.ql.io.AcidUtils.AcidOperationalProperties), AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)
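
For context, a minimal sketch of how a caller might drain the reader returned by getReader() above. The class name AcidRowScanner is hypothetical, and it assumes RowReader is the nested org.apache.hadoop.hive.ql.io.AcidInputFormat.RowReader interface; only the createKey/createValue/next/close methods implemented by the anonymous reader are used.

import java.io.IOException;

import org.apache.hadoop.hive.ql.io.AcidInputFormat;
import org.apache.hadoop.hive.ql.io.RecordIdentifier;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;

public class AcidRowScanner {

    // Hypothetical helper: iterates a RowReader to completion and counts the rows
    // it returns. Delete events are already filtered out by the reader's next(),
    // as shown in the anonymous implementation above.
    public static long countRows(AcidInputFormat.RowReader<OrcStruct> reader) throws IOException {
        long rows = 0;
        RecordIdentifier key = reader.createKey();
        OrcStruct value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                rows++;
            }
        } finally {
            reader.close();
        }
        return rows;
    }
}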

Example 8 with ValidWriteIdList

Use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.

From class TestStreaming, method checkDataWritten:

/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, String... records) throws Exception {
    ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    System.out.println("Files found: ");
    for (AcidUtils.ParsedDelta pd : current) {
        System.out.println(pd.getPath().toString());
    }
    Assert.assertEquals(numExpectedFiles, current.size());
    // find the minimum and maximum write ids across the current delta directories
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    for (AcidUtils.ParsedDelta pd : current) {
        if (pd.getMaxWriteId() > max) {
            max = pd.getMaxWriteId();
        }
        if (pd.getMinWriteId() < min) {
            min = pd.getMinWriteId();
        }
    }
    Assert.assertEquals(minTxn, min);
    Assert.assertEquals(maxTxn, max);
    InputFormat inf = new OrcInputFormat();
    JobConf job = new JobConf();
    job.set("mapred.input.dir", partitionPath.toString());
    job.set(BUCKET_COUNT, Integer.toString(buckets));
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
    job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
    AcidUtils.setAcidOperationalProperties(job, true, null);
    job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
    job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    InputSplit[] splits = inf.getSplits(job, buckets);
    Assert.assertEquals(numExpectedFiles, splits.length);
    org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
    NullWritable key = rr.createKey();
    OrcStruct value = rr.createValue();
    for (String record : records) {
        Assert.assertEquals(true, rr.next(key, value));
        Assert.assertEquals(record, value.toString());
    }
    Assert.assertEquals(false, rr.next(key, value));
}
Also used: NullWritable (org.apache.hadoop.io.NullWritable), OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), InputFormat (org.apache.hadoop.mapred.InputFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils), AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)
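
A minimal sketch of the write-id round trip this test relies on: the producer publishes the ValidWriteIdList under ValidWriteIdList.VALID_WRITEIDS_KEY, exactly as the test does with job.set(...), and a reader reconstructs it the way OrcInputFormat.getReader() does in Example 7. The class name WriteIdConfRoundTrip is hypothetical.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;

public class WriteIdConfRoundTrip {

    // Serialize the write-id list into the configuration, mirroring
    // job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()) above.
    public static void publish(Configuration conf, ValidWriteIdList writeIds) {
        conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
    }

    // Rebuild it on the reader side; an empty ValidReaderWriteIdList is the
    // fallback when nothing was published, matching Example 7.
    public static ValidWriteIdList read(Configuration conf) {
        String serialized = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
        return (serialized == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(serialized);
    }
}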

Example 9 with ValidWriteIdList

Use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.

From class TestStreaming, method checkNothingWritten:

private void checkNothingWritten(Path partitionPath) throws Exception {
    ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
    AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
    Assert.assertEquals(0, dir.getObsolete().size());
    Assert.assertEquals(0, dir.getOriginalFiles().size());
    List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
    Assert.assertEquals(0, current.size());
}
Also used: ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils), AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils)

Example 10 with ValidWriteIdList

Use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.

From class Cleaner, method clean:

private void clean(CompactionInfo ci) throws MetaException {
    LOG.info("Starting cleaning for " + ci.getFullPartitionName());
    try {
        Table t = resolveTable(ci);
        if (t == null) {
            // The table was dropped before we got around to cleaning it.
            LOG.info("Unable to find table " + ci.getFullTableName() + ", assuming it was dropped");
            txnHandler.markCleaned(ci);
            return;
        }
        Partition p = null;
        if (ci.partName != null) {
            p = resolvePartition(ci);
            if (p == null) {
                // The partition was dropped before we got around to cleaning it.
                LOG.info("Unable to find partition " + ci.getFullPartitionName() + ", assuming it was dropped");
                txnHandler.markCleaned(ci);
                return;
            }
        }
        StorageDescriptor sd = resolveStorageDescriptor(t, p);
        final String location = sd.getLocation();
        /**
         * Each Compaction only compacts as far as the highest txn id such that all txns below it
         * are resolved (i.e. not opened).  This is what "highestWriteId" tracks.  This is only tracked
         * since Hive 1.3.0/2.0 - thus may be 0.  See ValidCompactorWriteIdList and uses for more info.
         *
         * We only want to clean up to the highestWriteId - otherwise we risk deleting deltas from
         * under an active reader.
         *
         * Suppose we have deltas D2 D3 for table T, i.e. the last compaction created D3 so now there is a
         * clean request for D2.
         * Cleaner checks existing locks and finds none.
         * Between that check and removeFiles() a query starts (it will be reading D3) and another compaction
         * completes which creates D4.
         * Now removeFiles() (more specifically AcidUtils.getAcidState()) will declare D3 to be obsolete
         * unless ValidTxnList is "capped" at highestWriteId.
         */
        final ValidWriteIdList txnList = (ci.highestWriteId > 0) ? new ValidReaderWriteIdList(ci.getFullTableName(), new long[0], new BitSet(), ci.highestWriteId) : new ValidReaderWriteIdList();
        if (runJobAsSelf(ci.runAs)) {
            removeFiles(location, txnList);
        } else {
            LOG.info("Cleaning as user " + ci.runAs + " for " + ci.getFullPartitionName());
            UserGroupInformation ugi = UserGroupInformation.createProxyUser(ci.runAs, UserGroupInformation.getLoginUser());
            ugi.doAs(new PrivilegedExceptionAction<Object>() {

                @Override
                public Object run() throws Exception {
                    removeFiles(location, txnList);
                    return null;
                }
            });
            try {
                FileSystem.closeAllForUGI(ugi);
            } catch (IOException exception) {
                LOG.error("Could not clean up file-system handles for UGI: " + ugi + " for " + ci.getFullPartitionName(), exception);
            }
        }
        txnHandler.markCleaned(ci);
    } catch (Exception e) {
        LOG.error("Caught exception when cleaning, unable to complete cleaning of " + ci + " " + StringUtils.stringifyException(e));
        txnHandler.markFailed(ci);
    }
}
Also used: Partition (org.apache.hadoop.hive.metastore.api.Partition), Table (org.apache.hadoop.hive.metastore.api.Table), StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor), BitSet (java.util.BitSet), IOException (java.io.IOException), MetaException (org.apache.hadoop.hive.metastore.api.MetaException), ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList), ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList), UserGroupInformation (org.apache.hadoop.security.UserGroupInformation)
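
A minimal sketch (hypothetical helper CleanerWriteIdCap) of the "capped" write-id list described in the comment inside clean(): it mirrors the ValidReaderWriteIdList construction above, passing an empty exceptions array, an empty BitSet, and highestWriteId as the cap, so that AcidUtils.getAcidState() never declares newer deltas obsolete from under an active reader.

import java.util.BitSet;

import org.apache.hadoop.hive.common.ValidReaderWriteIdList;
import org.apache.hadoop.hive.common.ValidWriteIdList;

public class CleanerWriteIdCap {

    // Hypothetical helper: builds the capped list used by the Cleaner above.
    // The constructor arguments follow the call in clean(): full table name,
    // an empty exceptions array, an empty BitSet, and highestWriteId as the cap.
    public static ValidWriteIdList cappedAt(String fullTableName, long highestWriteId) {
        return (highestWriteId > 0)
            ? new ValidReaderWriteIdList(fullTableName, new long[0], new BitSet(), highestWriteId)
            : new ValidReaderWriteIdList();
    }
}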

Aggregations

ValidWriteIdList (org.apache.hadoop.hive.common.ValidWriteIdList): 16
AcidUtils (org.apache.hadoop.hive.ql.io.AcidUtils): 6
Path (org.apache.hadoop.fs.Path): 5
OrcAcidUtils (org.apache.orc.impl.OrcAcidUtils): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
ValidReaderWriteIdList (org.apache.hadoop.hive.common.ValidReaderWriteIdList): 4
RecordIdentifier (org.apache.hadoop.hive.ql.io.RecordIdentifier): 4
IOException (java.io.IOException): 3
Partition (org.apache.hadoop.hive.metastore.api.Partition): 3
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 3
Table (org.apache.hadoop.hive.metastore.api.Table): 3
AcidOutputFormat (org.apache.hadoop.hive.ql.io.AcidOutputFormat): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 2
GetValidWriteIdsRequest (org.apache.hadoop.hive.metastore.api.GetValidWriteIdsRequest): 2
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 2
CompactionInfo (org.apache.hadoop.hive.metastore.txn.CompactionInfo): 2
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 2
OrcStruct (org.apache.hadoop.hive.ql.io.orc.OrcStruct): 2
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 2
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 2