Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.
Class VectorizedParquetRecordReader, method initialize:
@SuppressWarnings("deprecation")
// Note: jobConf, parts, metadataCache, cacheConf, file, columnNamesList, columnTypesList,
// colsToInclude, requestedSchema, fileSchema, writerTimezone, totalRowCount and reader
// are fields of the enclosing reader class (not shown in this excerpt).
public void initialize(InputSplit oldSplit, JobConf configuration)
    throws IOException, InterruptedException, HiveException {
  // the oldSplit may be null during the split phase
  if (oldSplit == null) {
    return;
  }
  ParquetMetadata footer;
  List<BlockMetaData> blocks;
  MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
  if (mapWork != null) {
    parts = mapWork.getPathToPartitionInfo();
  }
  ParquetInputSplit split = (ParquetInputSplit) oldSplit;
  boolean indexAccess =
      configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
  this.file = split.getPath();
  long[] rowGroupOffsets = split.getRowGroupOffsets();
  String columnNames = configuration.get(IOConstants.COLUMNS);
  columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
  String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
  columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
  // if task.side.metadata is set, rowGroupOffsets is null
  Object cacheKey = null;
  CacheTag cacheTag = null;
  // TODO: also support fileKey in splits, like OrcSplit does
  if (metadataCache != null) {
    if (cacheKey == null) {
      cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file,
          HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID),
          HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID),
          !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
    }
  }
  if (cacheKey != null) {
    if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
      PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
      cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
    }
    // If we are going to use cache, change the path to depend on file ID for extra consistency.
    FileSystem fs = file.getFileSystem(configuration);
    if (cacheKey instanceof Long
        && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
      file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
    }
  }
  if (rowGroupOffsets == null) {
    // TODO check whether rowGroupOffsets can be null
    // then we need to apply the predicate push down filter
    footer = readSplitFooter(configuration, file, cacheKey,
        range(split.getStart(), split.getEnd()), cacheTag);
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    FilterCompat.Filter filter = getFilter(configuration);
    blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
  } else {
    // otherwise we find the row groups that were selected on the client
    footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
    Set<Long> offsets = new HashSet<>();
    for (long offset : rowGroupOffsets) {
      offsets.add(offset);
    }
    blocks = new ArrayList<>();
    for (BlockMetaData block : footer.getBlocks()) {
      if (offsets.contains(block.getStartingPos())) {
        blocks.add(block);
      }
    }
    // verify we found them all
    if (blocks.size() != rowGroupOffsets.length) {
      long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
      for (int i = 0; i < foundRowGroupOffsets.length; i++) {
        foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
      }
      // provide a good error message in case there's a bug
      throw new IllegalStateException(
          "All the offsets listed in the split should be found in the file."
              + " expected: " + Arrays.toString(rowGroupOffsets)
              + " found: " + blocks
              + " out of: " + Arrays.toString(foundRowGroupOffsets)
              + " in range " + split.getStart() + ", " + split.getEnd());
    }
  }
  for (BlockMetaData block : blocks) {
    this.totalRowCount += block.getRowCount();
  }
  this.fileSchema = footer.getFileMetaData().getSchema();
  this.writerTimezone =
      DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
  colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
  requestedSchema = DataWritableReadSupport.getRequestedSchema(
      indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
  Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
  this.reader = new ParquetFileReader(
      configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
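The method above only builds a CacheTag when LLAP cache-usage tracking is enabled, via LlapHiveUtils.getDbAndTableNameForMetrics. As a minimal, self-contained sketch of the tag itself, the following assumes only the CacheTag.build(String) factory and getTableName() accessor that appear in the test snippets further down on this page; the class name and table name are hypothetical.

import org.apache.hadoop.hive.common.io.CacheTag;

// Hypothetical illustration class, not part of Hive.
public class CacheTagSketch {
  public static void main(String[] args) {
    // Build a tag from a "db.table" style name, as the tests below do.
    CacheTag tag = CacheTag.build("default.sample_table");
    // getTableName() returns the name the tag was built with; the proactive
    // eviction test below uses it to match buffers belonging to one table.
    System.out.println("cache tag table: " + tag.getTableName());
  }
}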
Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.
Class LlapCacheMetadataSerializer, method loadData:
// Note: conf, cache, metadataCache and tracePool are fields of LlapCacheMetadataSerializer;
// decodeCacheTag, decodeRanges and decodeFileKey are its private helpers (not shown here).
private void loadData(LlapDaemonProtocolProtos.CacheEntry ce) throws IOException {
  CacheTag cacheTag = decodeCacheTag(ce.getCacheTag());
  DiskRangeList ranges = decodeRanges(ce.getRangesList());
  Object fileKey = decodeFileKey(ce.getFileKey());
  try (LlapOrcCacheLoader llr = new LlapOrcCacheLoader(
      new Path(ce.getFilePath()), fileKey, conf, cache, metadataCache, cacheTag, tracePool)) {
    llr.init();
    llr.loadFileFooter();
    llr.loadRanges(ranges);
  }
}
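loadData replays one persisted cache entry: it decodes the tag, disk ranges and file key from the protobuf CacheEntry and pushes them through an LlapOrcCacheLoader. Below is a hedged sketch of a caller, assumed to live in the same class; the loadAll name and the LOG field are hypothetical, not part of the serializer shown above.

// Hypothetical driver inside LlapCacheMetadataSerializer: replay a list of
// persisted entries without letting one unreadable file abort the whole warm-up.
private void loadAll(List<LlapDaemonProtocolProtos.CacheEntry> entries) {
  for (LlapDaemonProtocolProtos.CacheEntry ce : entries) {
    try {
      loadData(ce);
    } catch (IOException e) {
      LOG.warn("Could not reload cache entry for " + ce.getFilePath(), e);
    }
  }
}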
Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.
Class TestFileCache, method testFileCacheMetadata:
@Test
public void testFileCacheMetadata() {
  ConcurrentHashMap<Object, FileCache<Object>> cache = new ConcurrentHashMap<>();
  Object fileKey = 1234L;
  Function<Void, Object> f = a -> new Object();
  CacheTag tag = CacheTag.build("test_table");
  FileCache<Object> result = FileCache.getOrAddFileSubCache(cache, fileKey, f, tag);
  assertEquals(fileKey, result.getFileKey());
  assertEquals(tag, result.getTag());
}
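As a small extension of this test, the get-or-add semantics suggested by the method name could be exercised as well. The sketch below assumes that a second call with the same fileKey hands back the already-created sub-cache (an assumption inferred from the name, not verified against the FileCache implementation here); the second file key and table name are hypothetical.

// Assumed behavior: the same fileKey yields the same sub-cache instance.
FileCache<Object> again = FileCache.getOrAddFileSubCache(cache, fileKey, f, tag);
assertSame(result, again);

// A different (hypothetical) file key gets its own sub-cache with its own tag.
CacheTag otherTag = CacheTag.build("other_table");
FileCache<Object> other = FileCache.getOrAddFileSubCache(cache, 5678L, f, otherTag);
assertEquals("other_table", other.getTag().getTableName());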
Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.
Class TestLowLevelCacheImpl, method _testProactiveEvictionMark:
private void _testProactiveEvictionMark(boolean isInstantDeallocation) {
  LowLevelCacheImpl cache = new LowLevelCacheImpl(LlapDaemonCacheMetrics.create("test", "1"),
      new DummyCachePolicy(), new DummyAllocator(), true, -1); // -1: no cleanup thread
  long fn1 = 1;
  long fn2 = 2;
  LlapDataBuffer[] buffs1 = IntStream.range(0, 4).mapToObj(i -> fb()).toArray(LlapDataBuffer[]::new);
  DiskRange[] drs1 = drs(IntStream.range(1, 5).toArray());
  CacheTag tag1 = CacheTag.build("default.table1");
  LlapDataBuffer[] buffs2 = IntStream.range(0, 41).mapToObj(i -> fb()).toArray(LlapDataBuffer[]::new);
  DiskRange[] drs2 = drs(IntStream.range(1, 42).toArray());
  CacheTag tag2 = CacheTag.build("default.table2");
  Predicate<CacheTag> predicate = tag -> "default.table1".equals(tag.getTableName());
  cache.putFileData(fn1, drs1, buffs1, 0, Priority.NORMAL, null, tag1);
  cache.putFileData(fn2, drs2, buffs2, 0, Priority.NORMAL, null, tag2);
  Arrays.stream(buffs1).forEach(b -> {
    b.decRef();
    b.decRef();
  });
  // Simulating eviction on a buffer
  assertEquals(INVALIDATE_OK, buffs1[2].invalidate());
  // buffs1[0,1,3] should be marked, as 2 is already invalidated
  assertEquals(3, cache.markBuffersForProactiveEviction(predicate, isInstantDeallocation));
  for (int i = 0; i < buffs1.length; ++i) {
    LlapDataBuffer buffer = buffs1[i];
    if (i == 2) {
      assertFalse(buffer.isMarkedForEviction());
    } else {
      assertTrue(buffer.isMarkedForEviction());
      assertEquals(isInstantDeallocation, buffer.isInvalid());
    }
  }
  // None of the buffers for file2 should be marked, as per the predicate
  for (LlapDataBuffer buffer : buffs2) {
    assertFalse(buffer.isMarkedForEviction());
  }
}
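The predicate above targets a single table by exact name. A broader predicate, for example one that marks everything cached for a whole database, can be written the same way; this sketch relies only on CacheTag.getTableName() as used in the test and treats the "db.table" naming convention as an assumption.

// Hedged sketch: mark every buffer whose tag belongs to the "default" database,
// assuming tags are built with "db.table" style names as in this test.
Predicate<CacheTag> wholeDbPredicate =
    tag -> tag != null && tag.getTableName() != null
        && tag.getTableName().startsWith("default.");
// Used the same way as the single-table predicate:
// cache.markBuffersForProactiveEviction(wholeDbPredicate, isInstantDeallocation);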
Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.
Class TestOrcMetadataCache, method testGetOrcTailForPathWithFileId:
@Test
public void testGetOrcTailForPathWithFileId() throws Exception {
  DummyMemoryManager mm = new DummyMemoryManager();
  DummyCachePolicy cp = new DummyCachePolicy();
  final int MAX_ALLOC = 64;
  LlapDaemonCacheMetrics metrics = LlapDaemonCacheMetrics.create("", "");
  BuddyAllocator alloc = new BuddyAllocator(
      false, false, 8, MAX_ALLOC, 1, 4 * 4096, 0, null, mm, metrics, null, true);
  MetadataCache cache = new MetadataCache(alloc, mm, cp, true, metrics);
  Path path = new Path("../data/files/alltypesorc");
  Configuration jobConf = new Configuration();
  Configuration daemonConf = new Configuration();
  CacheTag tag = CacheTag.build("test-table");
  FileSystem fs = FileSystem.get(daemonConf);
  FileStatus fileStatus = fs.getFileStatus(path);
  OrcTail uncached = OrcEncodedDataReader.getOrcTailForPath(
      fileStatus.getPath(), jobConf, tag, daemonConf, cache, new SyntheticFileId(fileStatus));
  jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
  // this should work from the cache, by recalculating the same fileId
  OrcTail cached = OrcEncodedDataReader.getOrcTailForPath(
      fileStatus.getPath(), jobConf, tag, daemonConf, cache, null);
  assertEquals(uncached.getSerializedTail(), cached.getSerializedTail());
  assertEquals(uncached.getFileTail(), cached.getFileTail());
}
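The second lookup passes null for the file id and, per the comment in the test, relies on the reader recalculating the same SyntheticFileId for the file. A short hedged sketch of that assumption in isolation, assuming SyntheticFileId implements value equality (implied by its use as a cache key, but not shown on this page):

// Two ids computed from the same FileStatus should be equal; otherwise the
// cached and uncached lookups above could not meet on the same cache key.
SyntheticFileId idA = new SyntheticFileId(fileStatus);
SyntheticFileId idB = new SyntheticFileId(fileStatus);
assertEquals(idA, idB);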