use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.
the class AcidUtils method getAcidFilesForStats.
public static List<FileStatus> getAcidFilesForStats(Table table, Path dir, Configuration jc, FileSystem fs) throws IOException {
  List<FileStatus> fileList = new ArrayList<>();
  ValidWriteIdList idList = AcidUtils.getTableValidWriteIdList(jc, AcidUtils.getFullTableName(table.getDbName(), table.getTableName()));
  if (idList == null) {
    LOG.warn("Cannot get ACID state for " + table.getDbName() + "." + table.getTableName() + " from " + jc.get(ValidTxnWriteIdList.VALID_TABLES_WRITEIDS_KEY));
    return null;
  }
  Directory acidInfo = AcidUtils.getAcidState(dir, jc, idList);
  // Assume that for an MM table, or if there's only the base directory, we are good.
  if (!acidInfo.getCurrentDirectories().isEmpty() && AcidUtils.isFullAcidTable(table)) {
    Utilities.FILE_OP_LOGGER.warn("Computing stats for an ACID table; stats may be inaccurate");
  }
  if (fs == null) {
    fs = dir.getFileSystem(jc);
  }
  for (HdfsFileStatusWithId hfs : acidInfo.getOriginalFiles()) {
    fileList.add(hfs.getFileStatus());
  }
  for (ParsedDelta delta : acidInfo.getCurrentDirectories()) {
    for (FileStatus f : HiveStatsUtils.getFileStatusRecurse(delta.getPath(), -1, fs)) {
      fileList.add(f);
    }
  }
  if (acidInfo.getBaseDirectory() != null) {
    for (FileStatus f : HiveStatsUtils.getFileStatusRecurse(acidInfo.getBaseDirectory(), -1, fs)) {
      fileList.add(f);
    }
  }
  return fileList;
}
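For context, a minimal sketch of how a caller might drive this helper when computing basic statistics. The table, partitionDir, jobConf, and LOG names below are hypothetical stand-ins for objects the caller would already have; only the getAcidFilesForStats call and FileStatus.getLen() come from the snippet above and Hadoop's API.

// Hypothetical caller: sum raw data size from the ACID-aware file listing.
List<FileStatus> statsFiles = AcidUtils.getAcidFilesForStats(table, partitionDir, jobConf, null);
if (statsFiles != null) {
  long totalSize = 0L;
  for (FileStatus f : statsFiles) {
    totalSize += f.getLen();
  }
  LOG.info("Raw data size under " + partitionDir + ": " + totalSize + " bytes");
}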
use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.
the class OrcInputFormat method getReader.
@Override
public RowReader<OrcStruct> getReader(InputSplit inputSplit, Options options) throws IOException {
  final OrcSplit split = (OrcSplit) inputSplit;
  // Retrieve the acidOperationalProperties for the table, initialized in HiveInputFormat.
  AcidUtils.AcidOperationalProperties acidOperationalProperties = AcidUtils.getAcidOperationalProperties(options.getConfiguration());
  if (!acidOperationalProperties.isSplitUpdate()) {
    throw new IllegalStateException("Expected SplitUpdate table: " + split.getPath());
  }
  final Path[] deltas = VectorizedOrcAcidRowBatchReader.getDeleteDeltaDirsFromSplit(split);
  final Configuration conf = options.getConfiguration();
  final Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, split);
  OrcRawRecordMerger.Options mergerOptions = new OrcRawRecordMerger.Options().isCompacting(false);
  mergerOptions.rootPath(split.getRootDir());
  mergerOptions.bucketPath(split.getPath());
  final int bucket;
  if (split.hasBase()) {
    AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf);
    if (acidIOOptions.getBucketId() < 0) {
      LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring");
    }
    bucket = acidIOOptions.getBucketId();
    if (split.isOriginal()) {
      mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath());
    }
  } else {
    bucket = (int) split.getStart();
    assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath();
  }
  // todo: createOptionsForReader() assumes it's !isOriginal.... why?
  final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf);
  readOptions.range(split.getStart(), split.getLength());
  String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
  ValidWriteIdList validWriteIdList = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
  LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN));
  final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions);
  return new RowReader<OrcStruct>() {
    OrcStruct innerRecord = records.createValue();

    @Override
    public ObjectInspector getObjectInspector() {
      return OrcStruct.createObjectInspector(0, OrcUtils.getOrcTypes(readOptions.getSchema()));
    }

    @Override
    public boolean next(RecordIdentifier recordIdentifier, OrcStruct orcStruct) throws IOException {
      boolean result;
      // filter out the deleted records
      do {
        result = records.next(recordIdentifier, innerRecord);
      } while (result && OrcRecordUpdater.getOperation(innerRecord) == OrcRecordUpdater.DELETE_OPERATION);
      if (result) {
        // swap the fields with the passed in orcStruct
        orcStruct.linkFields(OrcRecordUpdater.getRow(innerRecord));
      }
      return result;
    }

    @Override
    public RecordIdentifier createKey() {
      return records.createKey();
    }

    @Override
    public OrcStruct createValue() {
      return new OrcStruct(records.getColumns());
    }

    @Override
    public long getPos() throws IOException {
      return records.getPos();
    }

    @Override
    public void close() throws IOException {
      records.close();
    }

    @Override
    public float getProgress() throws IOException {
      return records.getProgress();
    }
  };
}
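The snippet above only works if a write-id list was serialized into the job Configuration under ValidWriteIdList.VALID_WRITEIDS_KEY beforehand. A minimal sketch of that round trip, assuming a made-up table name and high-water mark (in practice the list comes from the metastore, as in the TestStreaming examples below):

// Producer side (hypothetical values): serialize a reader write-id list into the conf.
Configuration conf = new Configuration();
ValidWriteIdList writeIds = new ValidReaderWriteIdList("default.acid_tbl", new long[0], new BitSet(), 42L);
conf.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.writeToString());

// Consumer side: parse it back the same way getReader() does at the top of this page.
String txnString = conf.get(ValidWriteIdList.VALID_WRITEIDS_KEY);
ValidWriteIdList parsed = (txnString == null) ? new ValidReaderWriteIdList() : new ValidReaderWriteIdList(txnString);
assert parsed.getHighWatermark() == 42L;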
use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.
the class TestStreaming method checkDataWritten.
/**
 * @deprecated use {@link #checkDataWritten2(Path, long, long, int, String, boolean, String...)} -
 * there is little value in using InputFormat directly
 */
@Deprecated
private void checkDataWritten(Path partitionPath, long minTxn, long maxTxn, int buckets, int numExpectedFiles, String... records) throws Exception {
  ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
  AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
  Assert.assertEquals(0, dir.getObsolete().size());
  Assert.assertEquals(0, dir.getOriginalFiles().size());
  List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
  System.out.println("Files found: ");
  for (AcidUtils.ParsedDelta pd : current) {
    System.out.println(pd.getPath().toString());
  }
  Assert.assertEquals(numExpectedFiles, current.size());
  // find the absolute minimum and maximum write IDs across the current deltas
  long min = Long.MAX_VALUE;
  long max = Long.MIN_VALUE;
  for (AcidUtils.ParsedDelta pd : current) {
    if (pd.getMaxWriteId() > max) {
      max = pd.getMaxWriteId();
    }
    if (pd.getMinWriteId() < min) {
      min = pd.getMinWriteId();
    }
  }
  Assert.assertEquals(minTxn, min);
  Assert.assertEquals(maxTxn, max);
  InputFormat inf = new OrcInputFormat();
  JobConf job = new JobConf();
  job.set("mapred.input.dir", partitionPath.toString());
  job.set(BUCKET_COUNT, Integer.toString(buckets));
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg");
  job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string");
  AcidUtils.setAcidOperationalProperties(job, true, null);
  job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
  job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString());
  InputSplit[] splits = inf.getSplits(job, buckets);
  Assert.assertEquals(numExpectedFiles, splits.length);
  org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL);
  NullWritable key = rr.createKey();
  OrcStruct value = rr.createValue();
  for (String record : records) {
    Assert.assertEquals(true, rr.next(key, value));
    Assert.assertEquals(record, value.toString());
  }
  Assert.assertEquals(false, rr.next(key, value));
}
use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.
the class TestStreaming method checkNothingWritten.
private void checkNothingWritten(Path partitionPath) throws Exception {
  ValidWriteIdList writeIds = msClient.getValidWriteIds(AcidUtils.getFullTableName(dbName, tblName));
  AcidUtils.Directory dir = AcidUtils.getAcidState(partitionPath, conf, writeIds);
  Assert.assertEquals(0, dir.getObsolete().size());
  Assert.assertEquals(0, dir.getOriginalFiles().size());
  List<AcidUtils.ParsedDelta> current = dir.getCurrentDirectories();
  Assert.assertEquals(0, current.size());
}
use of org.apache.hadoop.hive.common.ValidWriteIdList in project hive by apache.
the class Cleaner method clean.
private void clean(CompactionInfo ci) throws MetaException {
  LOG.info("Starting cleaning for " + ci.getFullPartitionName());
  try {
    Table t = resolveTable(ci);
    if (t == null) {
      // The table was dropped before we got around to cleaning it.
      LOG.info("Unable to find table " + ci.getFullTableName() + ", assuming it was dropped");
      txnHandler.markCleaned(ci);
      return;
    }
    Partition p = null;
    if (ci.partName != null) {
      p = resolvePartition(ci);
      if (p == null) {
        // The partition was dropped before we got around to cleaning it.
        LOG.info("Unable to find partition " + ci.getFullPartitionName() + ", assuming it was dropped");
        txnHandler.markCleaned(ci);
        return;
      }
    }
    StorageDescriptor sd = resolveStorageDescriptor(t, p);
    final String location = sd.getLocation();
    /**
     * Each compaction only compacts as far as the highest write id such that all transactions below it
     * are resolved (i.e. no longer open). This is what "highestWriteId" tracks. It is only tracked
     * since Hive 1.3.0/2.0 and thus may be 0. See ValidCompactorWriteIdList and its uses for more info.
     *
     * We only want to clean up to highestWriteId - otherwise we risk deleting deltas from
     * under an active reader.
     *
     * Suppose we have deltas D2 and D3 for table T, i.e. the last compaction created D3, and now there is a
     * clean request for D2.
     * The Cleaner checks existing locks and finds none.
     * Between that check and removeFiles(), a query starts (it will be reading D3) and another compaction
     * completes, creating D4.
     * Now removeFiles() (more specifically AcidUtils.getAcidState()) will declare D3 obsolete
     * unless the ValidWriteIdList is "capped" at highestWriteId.
     */
    final ValidWriteIdList txnList = (ci.highestWriteId > 0) ? new ValidReaderWriteIdList(ci.getFullTableName(), new long[0], new BitSet(), ci.highestWriteId) : new ValidReaderWriteIdList();
    if (runJobAsSelf(ci.runAs)) {
      removeFiles(location, txnList);
    } else {
      LOG.info("Cleaning as user " + ci.runAs + " for " + ci.getFullPartitionName());
      UserGroupInformation ugi = UserGroupInformation.createProxyUser(ci.runAs, UserGroupInformation.getLoginUser());
      ugi.doAs(new PrivilegedExceptionAction<Object>() {
        @Override
        public Object run() throws Exception {
          removeFiles(location, txnList);
          return null;
        }
      });
      try {
        FileSystem.closeAllForUGI(ugi);
      } catch (IOException exception) {
        LOG.error("Could not clean up file-system handles for UGI: " + ugi + " for " + ci.getFullPartitionName(), exception);
      }
    }
    txnHandler.markCleaned(ci);
  } catch (Exception e) {
    LOG.error("Caught exception when cleaning, unable to complete cleaning of " + ci + " " + StringUtils.stringifyException(e));
    txnHandler.markFailed(ci);
  }
}
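To make the capping concrete, here is a minimal sketch with made-up write ids of the list the Cleaner builds when highestWriteId is 3; anything above the cap is reported invalid, which is why getAcidState() will not treat deltas produced by later compactions as obsolete. The table name "default.t" is hypothetical; the constructor and isWriteIdValid() are the same ValidWriteIdList APIs used in clean() above.

// Hypothetical cap at write id 3 for table "default.t" (mirrors the construction above).
ValidWriteIdList capped = new ValidReaderWriteIdList("default.t", new long[0], new BitSet(), 3);

// Write ids at or below the cap are visible in the Cleaner's view of the directory...
assert capped.isWriteIdValid(2);
assert capped.isWriteIdValid(3);
// ...while a delta created by a later compaction (e.g. write id 4 -> D4) is not,
// so it cannot cause D3 to be flagged obsolete out from under an active reader.
assert !capped.isWriteIdValid(4);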