Use of org.apache.iceberg.FileFormat in project hive by apache.
The class HiveVectorizedReader, method reader.
public static <D> CloseableIterable<D> reader(InputFile inputFile, FileScanTask task, Map<Integer, ?> idToConstant,
    TaskAttemptContext context) {
  // Tweaks on jobConf here are relevant for this task only, so we need to copy it first, as the context's conf is reused.
  JobConf job = new JobConf((JobConf) context.getConfiguration());
  Path path = new Path(inputFile.location());
  FileFormat format = task.file().format();
  Reporter reporter = ((MapredIcebergInputFormat.CompatibilityTaskAttemptContextImpl) context).getLegacyReporter();

  // Hive by default requires partition columns to be read too. This is not required for identity partition
  // columns, as we will add these as constants later.
  int[] partitionColIndices = null;
  Object[] partitionValues = null;
  PartitionSpec partitionSpec = task.spec();
  List<Integer> readColumnIds = ColumnProjectionUtils.getReadColumnIDs(job);

  if (!partitionSpec.isUnpartitioned()) {
    List<PartitionField> fields = partitionSpec.fields();
    List<Integer> partitionColIndicesList = Lists.newLinkedList();
    List<Object> partitionValuesList = Lists.newLinkedList();

    for (PartitionField partitionField : fields) {
      if (partitionField.transform().isIdentity()) {
        // Get columns in read schema order (which matches those of readColumnIds) to find partition column indices
        List<Types.NestedField> columns = task.spec().schema().columns();
        for (int colIdx = 0; colIdx < columns.size(); ++colIdx) {
          if (columns.get(colIdx).fieldId() == partitionField.sourceId()) {
            // Skip reading identity partition columns from the source file...
            readColumnIds.remove((Integer) colIdx);
            // ...and use the corresponding constant value instead
            partitionColIndicesList.add(colIdx);
            partitionValuesList.add(idToConstant.get(partitionField.sourceId()));
            break;
          }
        }
      }
    }

    partitionColIndices = ArrayUtils.toPrimitive(partitionColIndicesList.toArray(new Integer[0]));
    partitionValues = partitionValuesList.toArray(new Object[0]);
    ColumnProjectionUtils.setReadColumns(job, readColumnIds);
  }

  try {
    long start = task.start();
    long length = task.length();

    // TODO: Iceberg currently does not track the last modification time of a file. Until that's added,
    // we need to set Long.MIN_VALUE as the last modification time in the fileId triplet.
    SyntheticFileId fileId = new SyntheticFileId(path, task.file().fileSizeInBytes(), Long.MIN_VALUE);

    RecordReader<NullWritable, VectorizedRowBatch> recordReader = null;
    switch (format) {
      case ORC:
        recordReader = orcRecordReader(job, reporter, task, inputFile, path, start, length, readColumnIds, fileId);
        break;
      case PARQUET:
        recordReader = parquetRecordReader(job, reporter, task, path, start, length);
        break;
      default:
        throw new UnsupportedOperationException("Vectorized Hive reading unimplemented for format: " + format);
    }

    return createVectorizedRowBatchIterable(recordReader, job, partitionColIndices, partitionValues);
  } catch (IOException ioe) {
    throw new RuntimeException("Error creating vectorized record reader for " + inputFile, ioe);
  }
}
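The FileFormat-independent part of reader() is the identity-partition bookkeeping: identity partition columns are removed from the projected read column IDs and served from constants keyed by field id. The following is a minimal, self-contained sketch of just that bookkeeping, using plain collections and hypothetical field IDs rather than the actual Hive/Iceberg types.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Hypothetical, simplified illustration of the identity-partition handling in reader():
// identity partition columns are dropped from the projection and filled from constants instead.
public class PartitionConstantSketch {
  public static void main(String[] args) {
    // Read schema as column position -> field id (assumed example values).
    List<Integer> schemaFieldIds = Arrays.asList(1, 2, 3);
    List<Integer> readColumnIds = new ArrayList<>(Arrays.asList(0, 1, 2));

    // Pretend field id 1 is an identity partition column with constant value 0L.
    Map<Integer, Object> idToConstant = new HashMap<>();
    idToConstant.put(1, 0L);

    List<Integer> partitionColIndices = new ArrayList<>();
    List<Object> partitionValues = new ArrayList<>();

    for (Map.Entry<Integer, Object> constant : idToConstant.entrySet()) {
      for (int colIdx = 0; colIdx < schemaFieldIds.size(); ++colIdx) {
        if (schemaFieldIds.get(colIdx).equals(constant.getKey())) {
          readColumnIds.remove((Integer) colIdx);   // do not read this column from the file...
          partitionColIndices.add(colIdx);          // ...remember its position in the batch...
          partitionValues.add(constant.getValue()); // ...and the constant that will fill it
          break;
        }
      }
    }

    System.out.println(readColumnIds);       // [1, 2]
    System.out.println(partitionColIndices); // [0]
    System.out.println(partitionValues);     // [0]
  }
}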
Use of org.apache.iceberg.FileFormat in project hive by apache.
The class HiveIcebergTestUtils, method createEqualityDeleteFile.
/**
 * @param table The table to create the delete file for
 * @param deleteFilePath The path where the delete file should be created, relative to the table location root
 * @param equalityFields List of field names that should play a role in the equality check
 * @param fileFormat The file format that should be used for writing out the delete file
 * @param rowsToDelete The rows that should be deleted. It is enough to fill out the fields that are relevant for the
 *                     equality check, as listed in equalityFields; the rest of the fields are ignored
 * @return The DeleteFile created
 * @throws IOException If there is an error during the DeleteFile write
 */
public static DeleteFile createEqualityDeleteFile(Table table, String deleteFilePath, List<String> equalityFields,
    FileFormat fileFormat, List<Record> rowsToDelete) throws IOException {
  List<Integer> equalityFieldIds = equalityFields.stream()
      .map(id -> table.schema().findField(id).fieldId())
      .collect(Collectors.toList());
  Schema eqDeleteRowSchema = table.schema().select(equalityFields.toArray(new String[] {}));

  FileAppenderFactory<Record> appenderFactory = new GenericAppenderFactory(table.schema(), table.spec(),
      ArrayUtil.toIntArray(equalityFieldIds), eqDeleteRowSchema, null);
  EncryptedOutputFile outputFile = table.encryption().encrypt(HadoopOutputFile.fromPath(
      new org.apache.hadoop.fs.Path(table.location(), deleteFilePath), new Configuration()));

  PartitionKey part = new PartitionKey(table.spec(), eqDeleteRowSchema);
  part.partition(rowsToDelete.get(0));

  EqualityDeleteWriter<Record> eqWriter = appenderFactory.newEqDeleteWriter(outputFile, fileFormat, part);
  try (EqualityDeleteWriter<Record> writer = eqWriter) {
    writer.deleteAll(rowsToDelete);
  }
  return eqWriter.toDeleteFile();
}
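A test would typically pair this helper with a row-delta commit so the equality deletes become visible to readers. The sketch below is hypothetical (the table variable, file path, and "customer_id" field name are assumed); it only uses calls that appear in these snippets plus Iceberg's GenericRecord for building the delete row.

// Hypothetical usage sketch: delete every row whose customer_id is 0, then commit the delete file.
Schema deleteSchema = table.schema().select("customer_id");              // assumed column name
Record rowToDelete = org.apache.iceberg.data.GenericRecord.create(deleteSchema);
rowToDelete.setField("customer_id", 0L);

DeleteFile eqDeletes = HiveIcebergTestUtils.createEqualityDeleteFile(
    table,                                  // an existing Iceberg test table
    "data/eq-deletes-00000.parquet",        // assumed path, relative to the table location
    ImmutableList.of("customer_id"),        // equality fields
    FileFormat.PARQUET,
    ImmutableList.of(rowToDelete));

// Same commit pattern as the position-delete test below.
table.newRowDelta().addDeletes(eqDeletes).commit();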
Use of org.apache.iceberg.FileFormat in project hive by apache.
The class TestHiveIcebergV2, method testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied.
@Test
public void testReadAndWriteFormatV2Partitioned_PosDelete_RowNotSupplied() throws IOException {
  Assume.assumeFalse("Reading V2 tables with delete files are only supported currently in " +
      "non-vectorized mode and only Parquet/Avro", isVectorized || fileFormat == FileFormat.ORC);

  PartitionSpec spec = PartitionSpec.builderFor(HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA)
      .identity("customer_id").build();
  Table tbl = testTables.createTable(shell, "customers", HiveIcebergStorageHandlerTestUtils.CUSTOMER_SCHEMA, spec,
      fileFormat, HiveIcebergStorageHandlerTestUtils.CUSTOMER_RECORDS, 2);

  // add some more data to the same partition
  shell.executeStatement("insert into customers values (0, 'Laura', 'Yellow'), (0, 'John', 'Green'), " +
      "(0, 'Blake', 'Blue')");
  tbl.refresh();

  // delete the first and third rows from the newly-added data file - without supplying the deleted rows themselves
  DataFile dataFile = StreamSupport.stream(tbl.currentSnapshot().addedFiles().spliterator(), false)
      .filter(file -> file.partition().get(0, Long.class) == 0L)
      .filter(file -> file.recordCount() == 3)
      .findAny()
      .orElseThrow(() -> new RuntimeException("Did not find the desired data file in the test table"));
  List<PositionDelete<Record>> deletes = ImmutableList.of(
      positionDelete(dataFile.path(), 0L, null),
      positionDelete(dataFile.path(), 2L, null));
  DeleteFile deleteFile = HiveIcebergTestUtils.createPositionalDeleteFile(tbl, "dummyPath", fileFormat,
      ImmutableMap.of("customer_id", 0L), deletes);
  tbl.newRowDelta().addDeletes(deleteFile).commit();

  List<Object[]> objects = shell.executeStatement("SELECT * FROM customers ORDER BY customer_id, first_name");
  Assert.assertEquals(4, objects.size());
  Assert.assertArrayEquals(new Object[] { 0L, "Alice", "Brown" }, objects.get(0));
  Assert.assertArrayEquals(new Object[] { 0L, "John", "Green" }, objects.get(1));
  Assert.assertArrayEquals(new Object[] { 1L, "Bob", "Green" }, objects.get(2));
  Assert.assertArrayEquals(new Object[] { 2L, "Trudy", "Pink" }, objects.get(3));
}
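The positionDelete(path, pos, row) helper the test relies on is not shown in this snippet. An assumed reconstruction on top of Iceberg's org.apache.iceberg.deletes.PositionDelete container (not necessarily the exact helper from the Hive test code) could look like this:

// Assumed reconstruction of the positionDelete(...) helper used above: wraps a file path,
// a row position, and an optional row payload (null here, since the row is not supplied).
private static PositionDelete<Record> positionDelete(CharSequence path, long pos, Record row) {
  PositionDelete<Record> delete = PositionDelete.create();
  return delete.set(path, pos, row);
}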
Use of org.apache.iceberg.FileFormat in project hive by apache.
The class TestHiveIcebergOutputCommitter, method writeRecords.
/**
 * Write random records to the given table using the given {@link HiveIcebergOutputCommitter} and
 * a separate {@link HiveIcebergRecordWriter} for every task.
 * @param name The name of the table to get the table object from the conf
 * @param taskNum The number of tasks in the job handled by the committer
 * @param attemptNum The id used for attempt number generation
 * @param commitTasks If <code>true</code> the tasks will be committed
 * @param abortTasks If <code>true</code> the tasks will be aborted - needed so we can simulate a no commit/no abort
 *                   situation
 * @param conf The job configuration
 * @param committer The output committer that should be used for committing/aborting the tasks
 * @return The randomly generated records which were appended to the table
 * @throws IOException Propagating {@link HiveIcebergRecordWriter} exceptions
 */
private List<Record> writeRecords(String name, int taskNum, int attemptNum, boolean commitTasks, boolean abortTasks,
    JobConf conf, OutputCommitter committer) throws IOException {
  List<Record> expected = new ArrayList<>(RECORD_NUM * taskNum);
  Table table = HiveIcebergStorageHandler.table(conf, name);
  FileIO io = table.io();
  Schema schema = HiveIcebergStorageHandler.schema(conf);
  PartitionSpec spec = table.spec();

  for (int i = 0; i < taskNum; ++i) {
    List<Record> records = TestHelper.generateRandomRecords(schema, RECORD_NUM, i + attemptNum);
    TaskAttemptID taskId = new TaskAttemptID(JOB_ID.getJtIdentifier(), JOB_ID.getId(), TaskType.MAP, i, attemptNum);
    int partitionId = taskId.getTaskID().getId();
    String operationId = QUERY_ID + "-" + JOB_ID;
    FileFormat fileFormat = FileFormat.PARQUET;
    OutputFileFactory outputFileFactory = OutputFileFactory.builderFor(table, partitionId, attemptNum)
        .format(fileFormat)
        .operationId(operationId)
        .build();
    HiveFileWriterFactory hfwf = new HiveFileWriterFactory(table, fileFormat, schema, null, fileFormat,
        null, null, null, null);
    HiveIcebergRecordWriter testWriter = new HiveIcebergRecordWriter(schema, spec, fileFormat, hfwf,
        outputFileFactory, io, TARGET_FILE_SIZE, TezUtil.taskAttemptWrapper(taskId), conf.get(Catalogs.NAME));

    Container<Record> container = new Container<>();
    for (Record record : records) {
      container.set(record);
      testWriter.write(container);
    }

    testWriter.close(false);
    if (commitTasks) {
      committer.commitTask(new TaskAttemptContextImpl(conf, taskId));
      expected.addAll(records);
    } else if (abortTasks) {
      committer.abortTask(new TaskAttemptContextImpl(conf, taskId));
    }
  }
  return expected;
}
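The helper only commits or aborts the individual tasks; publishing the written data files to the Iceberg table still requires the job-level commit on the committer. A hedged caller sketch, where jobConf(...) stands for an assumed test helper that registers the table in the configuration:

// Hypothetical caller sketch: two committed map tasks, attempt 0, nothing aborted.
JobConf conf = jobConf(table, 2);                        // assumed helper from the test class
HiveIcebergOutputCommitter committer = new HiveIcebergOutputCommitter();

List<Record> expected = writeRecords(table.name(), 2, 0, true, false, conf, committer);

// Task commits alone are not enough; the job-level commit appends the data files to the table.
committer.commitJob(new JobContextImpl(conf, JOB_ID));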
Use of org.apache.iceberg.FileFormat in project presto by prestodb.
The class IcebergPageSink, method createWriter.
private WriteContext createWriter(Optional<PartitionData> partitionData) {
  String fileName = fileFormat.addExtension(randomUUID().toString());
  Path outputPath = partitionData
      .map(partition -> new Path(locationProvider.newDataLocation(partitionSpec, partition, fileName)))
      .orElse(new Path(locationProvider.newDataLocation(fileName)));
  IcebergFileWriter writer = fileWriterFactory.createFileWriter(outputPath, outputSchema, jobConf, session,
      hdfsContext, fileFormat);
  return new WriteContext(writer, outputPath, partitionData);
}
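The only FileFormat-specific call here is FileFormat.addExtension, which appends the format's file suffix to the generated name. A tiny standalone illustration (the printed values are just examples):

import java.util.UUID;
import org.apache.iceberg.FileFormat;

// Minimal illustration of FileFormat.addExtension: it appends the format's suffix to the file name.
public class AddExtensionExample {
  public static void main(String[] args) {
    String fileName = FileFormat.PARQUET.addExtension(UUID.randomUUID().toString());
    System.out.println(fileName);                                    // e.g. 3f1c...-....parquet
    System.out.println(FileFormat.ORC.addExtension("part-00000"));   // part-00000.orc
  }
}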