Use of org.apache.spark.sql.Row in project iceberg by apache.
The class BaseRewriteManifestsSparkAction, method writeManifest.
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter<DataFile> writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
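For reference, the rows handed to this helper are manifest-entry rows shaped as (snapshot_id, sequence_number, data_file struct), matching the column ordinals read above. The caller sketch below is hypothetical: manifestEntryDF, io, stagingLocation, spec, and sparkType are assumed to exist in the surrounding action and are not defined here.

import java.util.List;
import org.apache.iceberg.ManifestFile;
import org.apache.spark.sql.Row;

// Hypothetical caller: collect manifest-entry rows in the (snapshot_id, sequence_number,
// data_file) order expected by writeManifest and write them all into one new manifest.
List<Row> entryRows = manifestEntryDF
    .select("snapshot_id", "sequence_number", "data_file")
    .collectAsList();

ManifestFile rewritten = writeManifest(
    entryRows, 0, entryRows.size(),   // one manifest covering the whole slice
    io,                               // Broadcast<FileIO> shared with executors (assumed)
    stagingLocation,                  // directory for the new manifest file (assumed)
    2,                                // manifest format version, e.g. 2 for v2 tables
    spec,                             // PartitionSpec of the entries
    sparkType);                       // Spark StructType of the data_file struct (assumed)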
Use of org.apache.spark.sql.Row in project iceberg by apache.
The class BaseRewriteManifestsSparkAction, method doExecute.
private RewriteManifests.Result doExecute() {
  List<ManifestFile> matchingManifests = findMatchingManifests();
  if (matchingManifests.isEmpty()) {
    return BaseRewriteManifestsActionResult.empty();
  }

  long totalSizeBytes = 0L;
  int numEntries = 0;

  for (ManifestFile manifest : matchingManifests) {
    ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path());
    totalSizeBytes += manifest.length();
    numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount();
  }

  int targetNumManifests = targetNumManifests(totalSizeBytes);
  int targetNumManifestEntries = targetNumManifestEntries(numEntries, targetNumManifests);

  Dataset<Row> manifestEntryDF = buildManifestEntryDF(matchingManifests);

  List<ManifestFile> newManifests;
  if (spec.fields().size() < 1) {
    newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests);
  } else {
    newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries);
  }

  replaceManifests(matchingManifests, newManifests);

  return new BaseRewriteManifestsActionResult(matchingManifests, newManifests);
}
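This doExecute path is what runs when the rewrite-manifests action is invoked through Iceberg's Spark actions API. A minimal invocation sketch, assuming the SparkActions entry point from the Spark 3 module; spark and table are assumed to be an existing SparkSession and a loaded Iceberg Table, and the 8 MB threshold is only an example.

import org.apache.iceberg.actions.RewriteManifests;
import org.apache.iceberg.spark.actions.SparkActions;

// Rewrite only small manifests (here: under 8 MB) into fewer, larger ones.
RewriteManifests.Result result = SparkActions.get(spark)
    .rewriteManifests(table)
    .rewriteIf(manifest -> manifest.length() < 8L * 1024 * 1024)
    .execute();

System.out.println("Rewritten manifests: " + result.rewrittenManifests());
System.out.println("Added manifests: " + result.addedManifests());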
Use of org.apache.spark.sql.Row in project iceberg by apache.
The class Spark3BinPackStrategy, method rewriteFiles.
@Override
public Set<DataFile> rewriteFiles(List<FileScanTask> filesToRewrite) {
  String groupID = UUID.randomUUID().toString();
  try {
    manager.stageTasks(table, groupID, filesToRewrite);

    // Disable Adaptive Query Execution as this may change the output partitioning of our write
    SparkSession cloneSession = spark.cloneSession();
    cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false);

    Dataset<Row> scanDF = cloneSession.read().format("iceberg")
        .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID)
        .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite)))
        .option(SparkReadOptions.FILE_OPEN_COST, "0")
        .load(table.name());

    // All files within a file group are written with the same spec, so check the first
    boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table.spec());

    // Invoke a shuffle if the partition spec of the incoming partition does not match the table
    String distributionMode = requiresRepartition
        ? DistributionMode.RANGE.modeName()
        : DistributionMode.NONE.modeName();

    // write the packed data into new files where each split becomes a new file
    scanDF.write().format("iceberg")
        .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID)
        .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize())
        .option(SparkWriteOptions.DISTRIBUTION_MODE, distributionMode)
        .mode("append")
        .save(table.name());

    return rewriteCoordinator.fetchNewDataFiles(table, groupID);
  } finally {
    manager.removeTasks(table, groupID);
    rewriteCoordinator.clearRewrite(table, groupID);
  }
}
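This strategy is normally driven by the rewrite-data-files action rather than called directly. A minimal sketch of that entry point, assuming the SparkActions API of the Spark 3 module; spark and table are assumed to already exist, and the 512 MB target is only an example value.

import org.apache.iceberg.actions.RewriteDataFiles;
import org.apache.iceberg.spark.actions.SparkActions;

// Bin-pack small files toward ~512 MB outputs; the bin-pack strategy above does the
// per-group rewrite.
RewriteDataFiles.Result result = SparkActions.get(spark)
    .rewriteDataFiles(table)
    .binPack()
    .option("target-file-size-bytes", String.valueOf(512L * 1024 * 1024))
    .execute();

System.out.println("Rewritten data files: " + result.rewrittenDataFilesCount());
System.out.println("Added data files: " + result.addedDataFilesCount());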
Use of org.apache.spark.sql.Row in project iceberg by apache.
The class TestCreateActions, method validateTables.
private void validateTables(String source, String dest) throws NoSuchTableException, ParseException {
  List<Row> expected = spark.table(source).collectAsList();
  SparkTable destTable = loadTable(dest);
  Assert.assertEquals("Provider should be iceberg", "iceberg",
      destTable.properties().get(TableCatalog.PROP_PROVIDER));
  List<Row> actual = spark.table(dest).collectAsList();
  Assert.assertTrue(
      String.format("Rows in migrated table did not match\nExpected :%s rows \nFound :%s", expected, actual),
      expected.containsAll(actual) && actual.containsAll(expected));
}
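validateTables compares the row sets of a source table and a converted destination table. A hedged sketch of the kind of call that would precede it, assuming the snapshotTable entry point in SparkActions; both identifiers are placeholders and checked exceptions are omitted for brevity.

import org.apache.iceberg.spark.actions.SparkActions;

// Snapshot an existing Spark table into an Iceberg table, then compare the two.
SparkActions.get(spark)
    .snapshotTable("spark_catalog.db.source_table")
    .as("spark_catalog.db.dest_table")
    .execute();

validateTables("spark_catalog.db.source_table", "spark_catalog.db.dest_table");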
Use of org.apache.spark.sql.Row in project iceberg by apache.
The class TestFileRewriteCoordinator, method testBinPackRewrite.
@Test
public void testBinPackRewrite() throws NoSuchTableException, IOException {
  sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName);

  Dataset<Row> df = newDF(1000);
  df.coalesce(1).writeTo(tableName).append();
  df.coalesce(1).writeTo(tableName).append();
  df.coalesce(1).writeTo(tableName).append();
  df.coalesce(1).writeTo(tableName).append();

  Table table = validationCatalog.loadTable(tableIdent);
  Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots()));

  Dataset<Row> fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files"));
  List<Long> fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList();
  long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size();

  try (CloseableIterable<FileScanTask> fileScanTasks = table.newScan().planFiles()) {
    String fileSetID = UUID.randomUUID().toString();

    FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get();
    taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks));

    // read and pack original 4 files into 2 splits
    Dataset<Row> scanDF = spark.read().format("iceberg")
        .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID)
        .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2))
        .option(SparkReadOptions.FILE_OPEN_COST, "0")
        .load(tableName);

    // write the packed data into new files where each split becomes a new file
    scanDF.writeTo(tableName)
        .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID)
        .append();

    // commit the rewrite
    FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get();
    Set<DataFile> rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream()
        .map(FileScanTask::file)
        .collect(Collectors.toSet());
    Set<DataFile> addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID);
    table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit();
  }

  table.refresh();

  Map<String, String> summary = table.currentSnapshot().summary();
  Assert.assertEquals("Deleted files count must match", "4", summary.get("deleted-data-files"));
  Assert.assertEquals("Added files count must match", "2", summary.get("added-data-files"));

  Object rowCount = scalarSql("SELECT count(*) FROM %s", tableName);
  Assert.assertEquals("Row count must match", 4000L, rowCount);
}
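The same added/deleted file counters asserted above can also be inspected interactively through Iceberg's ".snapshots" metadata table. A small sketch, assuming the table is registered in a Spark catalog; "db.tbl" is a placeholder identifier.

// Show each snapshot's operation plus its added/deleted data file counts.
spark.sql("SELECT snapshot_id, operation, "
        + "summary['added-data-files'] AS added, summary['deleted-data-files'] AS deleted "
        + "FROM db.tbl.snapshots ORDER BY committed_at")
    .show(false);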