Use of org.apache.hadoop.hive.ql.io.SyntheticFileId in project hive by apache.
The class LlapCacheMetadataSerializer, method decodeFileKey.
/**
 * If the underlying filesystem supports it, the file key can be a unique file/inode ID represented by a long,
 * otherwise it's a combination of the path hash, the modification time and the length of the file.
 *
 * @see org.apache.hadoop.hive.llap.io.encoded.OrcEncodedDataReader#determineFileId
 */
@VisibleForTesting
static Object decodeFileKey(ByteString encodedFileKey) throws IOException {
  byte[] bytes = encodedFileKey.toByteArray();
  DataInput in = new DataInputStream(new ByteArrayInputStream(bytes));
  Object fileKey;
  if (bytes.length == Long.BYTES) {
    fileKey = in.readLong();
  } else {
    SyntheticFileId fileId = new SyntheticFileId();
    fileId.readFields(in);
    fileKey = fileId;
  }
  return fileKey;
}
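The decoder above implies a symmetric encoder: a plain long is written as exactly Long.BYTES bytes, anything else as the serialized SyntheticFileId fields. A minimal sketch of that inverse, assuming SyntheticFileId exposes a Writable-style write(DataOutput) matching the readFields(DataInput) used above (the encodeFileKey name is illustrative, not the serializer's confirmed API):

static ByteString encodeFileKey(Object fileKey) throws IOException {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  DataOutputStream out = new DataOutputStream(baos);
  if (fileKey instanceof Long) {
    // A raw file/inode ID: exactly Long.BYTES on the wire, which is what decodeFileKey checks for.
    out.writeLong((Long) fileKey);
  } else {
    // Assumed: SyntheticFileId writes its path hash, modification time and length via write().
    ((SyntheticFileId) fileKey).write(out);
  }
  out.flush();
  return ByteString.copyFrom(baos.toByteArray());
}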
Use of org.apache.hadoop.hive.ql.io.SyntheticFileId in project hive by apache.
The class TestOrcMetadataCache, method testGetOrcTailForPathWithFileId.
@Test
public void testGetOrcTailForPathWithFileId() throws Exception {
  DummyMemoryManager mm = new DummyMemoryManager();
  DummyCachePolicy cp = new DummyCachePolicy();
  final int MAX_ALLOC = 64;
  LlapDaemonCacheMetrics metrics = LlapDaemonCacheMetrics.create("", "");
  BuddyAllocator alloc = new BuddyAllocator(false, false, 8, MAX_ALLOC, 1, 4 * 4096, 0, null, mm, metrics, null, true);
  MetadataCache cache = new MetadataCache(alloc, mm, cp, true, metrics);
  Path path = new Path("../data/files/alltypesorc");
  Configuration jobConf = new Configuration();
  Configuration daemonConf = new Configuration();
  CacheTag tag = CacheTag.build("test-table");
  FileSystem fs = FileSystem.get(daemonConf);
  FileStatus fileStatus = fs.getFileStatus(path);
  OrcTail uncached = OrcEncodedDataReader.getOrcTailForPath(fileStatus.getPath(), jobConf, tag, daemonConf, cache, new SyntheticFileId(fileStatus));
  jobConf.set(HiveConf.ConfVars.LLAP_IO_CACHE_ONLY.varname, "true");
  // this should work from the cache, by recalculating the same fileId
  OrcTail cached = OrcEncodedDataReader.getOrcTailForPath(fileStatus.getPath(), jobConf, tag, daemonConf, cache, null);
  assertEquals(uncached.getSerializedTail(), cached.getSerializedTail());
  assertEquals(uncached.getFileTail(), cached.getFileTail());
}
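The second getOrcTailForPath call passes null for the file key and still hits the cache because the reader recomputes an identical SyntheticFileId from the file's status. A small illustrative check of the assumption behind that (equality over the path, length and modification-time triplet, which this test does not assert directly):

FileStatus status = fs.getFileStatus(path);
SyntheticFileId fromStatus = new SyntheticFileId(status);
SyntheticFileId fromTriplet = new SyntheticFileId(status.getPath(), status.getLen(), status.getModificationTime());
// Assumed: both constructors produce equal keys, so a cache entry stored under one is found under the other.
assertEquals(fromStatus, fromTriplet);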
Use of org.apache.hadoop.hive.ql.io.SyntheticFileId in project hive by apache.
The class HiveVectorizedReader, method reader.
public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant, TaskAttemptContext context) {
  // Tweaks on jobConf here are relevant for this task only, so we need to copy it first, as the context's conf is reused.
  JobConf job = new JobConf((JobConf) context.getConfiguration());
  Path path = new Path(inputFile.location());
  FileFormat format = task.file().format();
  Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();
  // Hive by default requires partition columns to be read too. This is not required for identity partition
  // columns, as we will add these as constants later.
  int[] partitionColIndices = null;
  Object[] partitionValues = null;
  PartitionSpec partitionSpec = task.spec();
  List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);
  if (!partitionSpec.isUnpartitioned()) {
    List<PartitionField> fields = partitionSpec.fields();
    List<Integer> partitionColIndicesList = Lists.newLinkedList();
    List<Object> partitionValuesList = Lists.newLinkedList();
    for (PartitionField partitionField : fields) {
      if (partitionField.transform().isIdentity()) {
        // Get columns in read schema order (which matches those of readColumnIds) to find partition column indices
        List<Types.NestedField> columns = task.spec().schema().columns();
        for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
          if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
            // Skip reading identity partition columns from source file...
            readColumnIds.remove((Integer) colIdx);
            // ...and use the corresponding constant value instead
            partitionColIndicesList.add(colIdx);
            partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
            break;
          }
        }
      }
    }
    partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
    partitionValues = partitionValuesList.toArray(new Object[0]);
    ColumnProjectionUtils.setReadColumns(job, readColumnIds);
  }
  try {
    long start = task.start();
    long length = task.length();
    // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
    // we need to set Long.MIN_VALUE as the last modification time in the fileId triplet.
    SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);
    RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
    switch (format) {
      case ORC:
        recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
        break;
      case PARQUET:
        recordReader = parquetRecordReader(job, reporter, task, path, start, length);
        break;
      default:
        throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
    }
    return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
  } catch (IOException ioe) {
    throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
  }
}
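A hypothetical caller-side sketch, only to show how the returned iterable might be consumed; the inputFile, task, idToConstant and context variables are assumed to come from the surrounding Iceberg/MapReduce plumbing, and D is assumed to be VectorizedRowBatch here:

long totalRows = 0;
try (CloseableIterable<VectorizedRowBatch> batches =
    HiveVectorizedReader.reader(inputFile, task, idToConstant, context)) {
  for (VectorizedRowBatch batch : batches) {
    // batch.size is the number of populated rows in this columnar batch
    totalRows += batch.size;
  }
} catch (IOException e) {
  throw new UncheckedIOException(e);
}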