
Example 6 with CacheTag

Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.

The class VectorizedParquetRecordReader, method initialize.

@SuppressWarnings("deprecation")
public void initialize(InputSplit oldSplit, JobConf configuration) throws IOException, InterruptedException, HiveException {
    // the oldSplit may be null during the split phase
    if (oldSplit == null) {
        return;
    }
    ParquetMetadata footer;
    List<BlockMetaData> blocks;
    MapWork mapWork = LlapHiveUtils.findMapWork(jobConf);
    if (mapWork != null) {
        parts = mapWork.getPathToPartitionInfo();
    }
    ParquetInputSplit split = (ParquetInputSplit) oldSplit;
    boolean indexAccess = configuration.getBoolean(DataWritableReadSupport.PARQUET_COLUMN_INDEX_ACCESS, false);
    this.file = split.getPath();
    long[] rowGroupOffsets = split.getRowGroupOffsets();
    String columnNames = configuration.get(IOConstants.COLUMNS);
    columnNamesList = DataWritableReadSupport.getColumnNames(columnNames);
    String columnTypes = configuration.get(IOConstants.COLUMNS_TYPES);
    columnTypesList = DataWritableReadSupport.getColumnTypes(columnTypes);
    // if task.side.metadata is set, rowGroupOffsets is null
    Object cacheKey = null;
    CacheTag cacheTag = null;
    // TODO: also support fileKey in splits, like OrcSplit does
    if (metadataCache != null) {
        if (cacheKey == null) {
            cacheKey = HdfsUtils.getFileId(file.getFileSystem(configuration), file, HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_ALLOW_SYNTHETIC_FILEID), HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_CACHE_DEFAULT_FS_FILE_ID), !HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH));
        }
    }
    if (cacheKey != null) {
        if (HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_TRACK_CACHE_USAGE)) {
            PartitionDesc partitionDesc = LlapHiveUtils.partitionDescForPath(split.getPath(), parts);
            cacheTag = LlapHiveUtils.getDbAndTableNameForMetrics(file, true, partitionDesc);
        }
        // If we are going to use cache, change the path to depend on file ID for extra consistency.
        FileSystem fs = file.getFileSystem(configuration);
        if (cacheKey instanceof Long && HiveConf.getBoolVar(cacheConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
            file = HdfsUtils.getFileIdPath(file, (long) cacheKey);
        }
    }
    if (rowGroupOffsets == null) {
        // TODO check whether rowGroupOffsets can be null
        // then we need to apply the predicate push down filter
        footer = readSplitFooter(configuration, file, cacheKey, range(split.getStart(), split.getEnd()), cacheTag);
        MessageType fileSchema = footer.getFileMetaData().getSchema();
        FilterCompat.Filter filter = getFilter(configuration);
        blocks = filterRowGroups(filter, footer.getBlocks(), fileSchema);
    } else {
        // otherwise we find the row groups that were selected on the client
        footer = readSplitFooter(configuration, file, cacheKey, NO_FILTER, cacheTag);
        Set<Long> offsets = new HashSet<>();
        for (long offset : rowGroupOffsets) {
            offsets.add(offset);
        }
        blocks = new ArrayList<>();
        for (BlockMetaData block : footer.getBlocks()) {
            if (offsets.contains(block.getStartingPos())) {
                blocks.add(block);
            }
        }
        // verify we found them all
        if (blocks.size() != rowGroupOffsets.length) {
            long[] foundRowGroupOffsets = new long[footer.getBlocks().size()];
            for (int i = 0; i < foundRowGroupOffsets.length; i++) {
                foundRowGroupOffsets[i] = footer.getBlocks().get(i).getStartingPos();
            }
            // provide a good error message in case there's a bug
            throw new IllegalStateException("All the offsets listed in the split should be found in the file." + " expected: " + Arrays.toString(rowGroupOffsets) + " found: " + blocks + " out of: " + Arrays.toString(foundRowGroupOffsets) + " in range " + split.getStart() + ", " + split.getEnd());
        }
    }
    for (BlockMetaData block : blocks) {
        this.totalRowCount += block.getRowCount();
    }
    this.fileSchema = footer.getFileMetaData().getSchema();
    this.writerTimezone = DataWritableReadSupport.getWriterTimeZoneId(footer.getFileMetaData().getKeyValueMetaData());
    colsToInclude = ColumnProjectionUtils.getReadColumnIDs(configuration);
    requestedSchema = DataWritableReadSupport.getRequestedSchema(indexAccess, columnNamesList, columnTypesList, fileSchema, configuration);
    Path path = wrapPathForCache(file, cacheKey, configuration, blocks, cacheTag);
    this.reader = new ParquetFileReader(configuration, footer.getFileMetaData(), path, blocks, requestedSchema.getColumns());
}
Also used : Path(org.apache.hadoop.fs.Path) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) FilterCompat(org.apache.parquet.filter2.compat.FilterCompat) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) ParquetInputSplit(org.apache.parquet.hadoop.ParquetInputSplit) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) MessageType(org.apache.parquet.schema.MessageType) HashSet(java.util.HashSet)
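
The footer-reading path above attaches a CacheTag only when LLAP cache-usage tracking (LLAP_TRACK_CACHE_USAGE) is enabled, and otherwise passes null through to readSplitFooter and wrapPathForCache. Below is a minimal standalone sketch of that null-or-tag pattern, using only CacheTag.build and getTableName as seen in the snippets on this page; trackCacheUsage and dbTableName are hypothetical stand-ins for the config flag and the partition's db.table name, not Hive APIs.

import org.apache.hadoop.hive.common.io.CacheTag;

public class CacheTagForTrackingSketch {

    // Mirrors the branching in initialize(): a tag is only built when tracking is on.
    static CacheTag tagFor(boolean trackCacheUsage, String dbTableName) {
        return trackCacheUsage ? CacheTag.build(dbTableName) : null;
    }

    public static void main(String[] args) {
        CacheTag tag = tagFor(true, "default.sample_table");
        System.out.println(tag == null ? "no tag" : tag.getTableName());
    }
}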

Example 7 with CacheTag

Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.

The class LlapCacheMetadataSerializer, method loadData.

private void loadData(LlapDaemonProtocolProtos.CacheEntry ce) throws IOException {
    CacheTag cacheTag = decodeCacheTag(ce.getCacheTag());
    DiskRangeList ranges = decodeRanges(ce.getRangesList());
    Object fileKey = decodeFileKey(ce.getFileKey());
    try (LlapOrcCacheLoader llr = new LlapOrcCacheLoader(new Path(ce.getFilePath()), fileKey, conf, cache, metadataCache, cacheTag, tracePool)) {
        llr.init();
        llr.loadFileFooter();
        llr.loadRanges(ranges);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DiskRangeList(org.apache.hadoop.hive.common.io.DiskRangeList) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) LlapOrcCacheLoader(org.apache.hadoop.hive.llap.io.encoded.LlapOrcCacheLoader)
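
loadData opens the LlapOrcCacheLoader in a try-with-resources block, so the loader is closed even when loading the footer or the ranges throws. A minimal sketch of that idiom follows, with a hypothetical FakeLoader standing in for LlapOrcCacheLoader (the class and its methods here are illustrative, not Hive code).

import java.io.IOException;

public class TryWithResourcesSketch {

    // Hypothetical loader: the load methods may throw, close() must still run.
    static class FakeLoader implements AutoCloseable {
        void init() { /* open handles, reserve buffers */ }
        void loadFileFooter() throws IOException { /* may fail on a corrupt file */ }
        @Override
        public void close() { System.out.println("loader closed"); }
    }

    public static void main(String[] args) throws IOException {
        try (FakeLoader loader = new FakeLoader()) {
            loader.init();
            loader.loadFileFooter();
        } // close() runs here whether or not the calls above threw
    }
}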

Example 8 with CacheTag

Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.

The class TestFileCache, method testFileCacheMetadata.

@Test
public void testFileCacheMetadata() {
    ConcurrentHashMap<Object, FileCache<Object>> cache = new ConcurrentHashMap<>();
    Object fileKey = 1234L;
    Function<Void, Object> f = a -> new Object();
    CacheTag tag = CacheTag.build("test_table");
    FileCache<Object> result = FileCache.getOrAddFileSubCache(cache, fileKey, f, tag);
    assertEquals(fileKey, result.getFileKey());
    assertEquals(tag, result.getTag());
}
Also used : Function(com.google.common.base.Function) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Test(org.junit.Test) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) Assert.assertEquals(org.junit.Assert.assertEquals)
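
A natural follow-up to the test above is checking that a second lookup with the same fileKey reuses the per-file sub-cache. The assertSame expectation below is an assumption based on the get-or-create naming of getOrAddFileSubCache, not something the original snippet verifies; assertSame is assumed to be statically imported from org.junit.Assert as in the other tests on this page.

@Test
public void testFileSubCacheIsReused() {
    ConcurrentHashMap<Object, FileCache<Object>> cache = new ConcurrentHashMap<>();
    Object fileKey = 1234L;
    CacheTag tag = CacheTag.build("test_table");
    Function<Void, Object> factory = a -> new Object();
    FileCache<Object> first = FileCache.getOrAddFileSubCache(cache, fileKey, factory, tag);
    FileCache<Object> second = FileCache.getOrAddFileSubCache(cache, fileKey, factory, tag);
    // Assumed get-or-create semantics: the same sub-cache instance comes back for the same key.
    assertSame(first, second);
    assertEquals(tag, second.getTag());
}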

Example 9 with CacheTag

Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.

The class TestLowLevelCacheImpl, method _testProactiveEvictionMark.

private void _testProactiveEvictionMark(boolean isInstantDeallocation) {
    LowLevelCacheImpl cache = new LowLevelCacheImpl(LlapDaemonCacheMetrics.create("test", "1"),
        new DummyCachePolicy(), new DummyAllocator(), true, /* no cleanup thread */ -1);
    long fn1 = 1;
    long fn2 = 2;
    LlapDataBuffer[] buffs1 = IntStream.range(0, 4).mapToObj(i -> fb()).toArray(LlapDataBuffer[]::new);
    DiskRange[] drs1 = drs(IntStream.range(1, 5).toArray());
    CacheTag tag1 = CacheTag.build("default.table1");
    LlapDataBuffer[] buffs2 = IntStream.range(0, 41).mapToObj(i -> fb()).toArray(LlapDataBuffer[]::new);
    DiskRange[] drs2 = drs(IntStream.range(1, 42).toArray());
    CacheTag tag2 = CacheTag.build("default.table2");
    Predicate<CacheTag> predicate = tag -> "default.table1".equals(tag.getTableName());
    cache.putFileData(fn1, drs1, buffs1, 0, Priority.NORMAL, null, tag1);
    cache.putFileData(fn2, drs2, buffs2, 0, Priority.NORMAL, null, tag2);
    Arrays.stream(buffs1).forEach(b -> {
        b.decRef();
        b.decRef();
    });
    // Simulating eviction on a buffer
    assertEquals(INVALIDATE_OK, buffs1[2].invalidate());
    // buffs1[0,1,3] should be marked, as 2 is already invalidated
    assertEquals(3, cache.markBuffersForProactiveEviction(predicate, isInstantDeallocation));
    for (int i = 0; i < buffs1.length; ++i) {
        LlapDataBuffer buffer = buffs1[i];
        if (i == 2) {
            assertFalse(buffer.isMarkedForEviction());
        } else {
            assertTrue(buffer.isMarkedForEviction());
            assertEquals(isInstantDeallocation, buffer.isInvalid());
        }
    }
    // All buffers for file2 should not be marked as per predicate
    for (LlapDataBuffer buffer : buffs2) {
        assertFalse(buffer.isMarkedForEviction());
    }
}
Also used : IntStream(java.util.stream.IntStream) Arrays(java.util.Arrays) LoggerFactory(org.slf4j.LoggerFactory) FutureTask(java.util.concurrent.FutureTask) Random(java.util.Random) Callable(java.util.concurrent.Callable) MemoryBuffer(org.apache.hadoop.hive.common.io.encoded.MemoryBuffer) Priority(org.apache.hadoop.hive.llap.cache.LowLevelCache.Priority) Assert.assertSame(org.junit.Assert.assertSame) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) DiskRange(org.apache.hadoop.hive.common.io.DiskRange) DiskRangeList(org.apache.hadoop.hive.common.io.DiskRangeList) Logger(org.slf4j.Logger) Executor(java.util.concurrent.Executor) INVALIDATE_OK(org.apache.hadoop.hive.llap.cache.LlapCacheableBuffer.INVALIDATE_OK) Predicate(java.util.function.Predicate) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) CreateHelper(org.apache.hadoop.hive.common.io.DiskRangeList.CreateHelper) CacheChunk(org.apache.hadoop.hive.ql.io.orc.encoded.CacheChunk) Executors(java.util.concurrent.Executors) DiskRangeListFactory(org.apache.hadoop.hive.common.io.DataCache.DiskRangeListFactory) CountDownLatch(java.util.concurrent.CountDownLatch) Assert.assertNull(org.junit.Assert.assertNull) Assert.assertFalse(org.junit.Assert.assertFalse) LlapDaemonCacheMetrics(org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics) Assert.assertEquals(org.junit.Assert.assertEquals)
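
Proactive eviction marking is driven entirely by a Predicate<CacheTag>, so a caller can target any grouping the tag's name encodes. The sketch below builds two such predicates from nothing but CacheTag.build and getTableName; the database-prefix variant is an illustration of how a whole database might be matched, not an existing Hive eviction API.

import java.util.function.Predicate;

import org.apache.hadoop.hive.common.io.CacheTag;

public class CacheTagPredicateSketch {

    public static void main(String[] args) {
        // Exact-match predicate, identical in shape to the one used in the test above.
        Predicate<CacheTag> singleTable = tag -> "default.table1".equals(tag.getTableName());
        // Illustrative prefix match over the db part of a "db.table" tag name.
        Predicate<CacheTag> defaultDb = tag -> tag.getTableName() != null
                && tag.getTableName().startsWith("default.");

        CacheTag t1 = CacheTag.build("default.table1");
        CacheTag t2 = CacheTag.build("other.table9");
        System.out.println(singleTable.test(t1) + " " + defaultDb.test(t1)); // true true
        System.out.println(singleTable.test(t2) + " " + defaultDb.test(t2)); // false false
    }
}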

Example 10 with CacheTag

Use of org.apache.hadoop.hive.common.io.CacheTag in project hive by apache.

The class TestOrcMetadataCache, method testGetOrcTailForPathWithFileId.

@Test
public void testGetOrcTailForPathWithFileId() throws Exception {
    DummyMemoryManager mm = new DummyMemoryManager();
    DummyCachePolicy cp = new DummyCachePolicy();
    final int MAX_ALLOC = 64;
    LlapDaemonCacheMetrics metrics = LlapDaemonCacheMetrics.create("", "");
    BuddyAllocator alloc = new BuddyAllocator(false, false, 8, MAX_ALLOC, 1, 4 * 4096, 0, null, mm, metrics, null, true);
    MetadataCache cache = new MetadataCache(alloc, mm, cp, true, metrics);
    Path path = new Path("../data/files/alltypesorc");
    Configuration jobConf = new Configuration();
    Configuration daemonConf = new Configuration();
    CacheTag tag = CacheTag.build("test-table");
    FileSystem fs = FileSystem.get(daemonConf);
    FileStatus fileStatus = fs.getFileStatus(path);
    OrcTail uncached = OrcEncodedDataReader.getOrcTailForPath(fileStatus.getPath(), jobConf, tag, daemonConf, cache, new SyntheticFileId(fileStatus));
    jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
    // this should work from the cache, by recalculating the same fileId
    OrcTail cached = OrcEncodedDataReader.getOrcTailForPath(fileStatus.getPath(), jobConf, tag, daemonConf, cache, null);
    assertEquals(uncached.getSerializedTail(), cached.getSerializedTail());
    assertEquals(uncached.getFileTail(), cached.getFileTail());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) SyntheticFileId(org.apache.hadoop.hive.ql.io.SyntheticFileId) MetadataCache(org.apache.hadoop.hive.llap.io.metadata.MetadataCache) LlapDaemonCacheMetrics(org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics) FileSystem(org.apache.hadoop.fs.FileSystem) CacheTag(org.apache.hadoop.hive.common.io.CacheTag) OrcTail(org.apache.orc.impl.OrcTail) Test(org.junit.Test)
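
The second getOrcTailForPath call above succeeds without an explicit file id because LLAP_IO_CACHE_ONLY is set and the same SyntheticFileId is recalculated from the file. The sketch below only prepares those inputs, reusing the calls that appear in the test; the /tmp path and the table name are hypothetical placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.io.CacheTag;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.SyntheticFileId;

public class OrcTailInputsSketch {

    public static void main(String[] args) throws Exception {
        Configuration daemonConf = new Configuration();
        Configuration jobConf = new Configuration();
        // Tag that groups every cached buffer of this file under a (hypothetical) table name.
        CacheTag tag = CacheTag.build("default.sample_table");
        // Stable file id derived from the file's status, as the uncached read above does.
        FileSystem fs = FileSystem.get(daemonConf);
        FileStatus status = fs.getFileStatus(new Path("/tmp/sample.orc"));
        SyntheticFileId fileId = new SyntheticFileId(status);
        // Ask LLAP IO to answer from cache only, as the second read in the test does.
        jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
        System.out.println("fileId=" + fileId + ", tag=" + tag.getTableName());
    }
}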

Aggregations

CacheTag (org.apache.hadoop.hive.common.io.CacheTag): 12 usages
Test (org.junit.Test): 8 usages
Path (org.apache.hadoop.fs.Path): 7 usages
LlapDaemonCacheMetrics (org.apache.hadoop.hive.llap.metrics.LlapDaemonCacheMetrics): 5 usages
Configuration (org.apache.hadoop.conf.Configuration): 4 usages
MetadataCache (org.apache.hadoop.hive.llap.io.metadata.MetadataCache): 4 usages
IOException (java.io.IOException): 3 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 3 usages
DiskRangeList (org.apache.hadoop.hive.common.io.DiskRangeList): 3 usages
SyntheticFileId (org.apache.hadoop.hive.ql.io.SyntheticFileId): 3 usages
OrcTail (org.apache.orc.impl.OrcTail): 3 usages
ByteBuffer (java.nio.ByteBuffer): 2 usages
Random (java.util.Random): 2 usages
Predicate (java.util.function.Predicate): 2 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 2 usages
DiskRange (org.apache.hadoop.hive.common.io.DiskRange): 2 usages
IllegalCacheConfigurationException (org.apache.hadoop.hive.llap.IllegalCacheConfigurationException): 2 usages
INVALIDATE_OK (org.apache.hadoop.hive.llap.cache.LlapCacheableBuffer.INVALIDATE_OK): 2 usages
Priority (org.apache.hadoop.hive.llap.cache.LowLevelCache.Priority): 2 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 2 usages