
Example 26 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class BaseRewriteManifestsSparkAction, method writeManifest, which writes a single manifest file from a slice of manifest-entry rows:

private static ManifestFile writeManifest(
        List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
        String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {
    String manifestName = "optimized-m-" + UUID.randomUUID();
    Path manifestPath = new Path(location, manifestName);
    OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));
    Types.StructType dataFileType = DataFile.getType(spec.partitionType());
    // SparkDataFile adapts a Spark Row with the data_file struct layout to Iceberg's DataFile interface
    SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);
    ManifestWriter<DataFile> writer = ManifestFiles.write(format, spec, outputFile, null);
    try {
        for (int index = startIndex; index < endIndex; index++) {
            // each row carries (snapshot_id, sequence_number, data_file struct)
            Row row = rows.get(index);
            long snapshotId = row.getLong(0);
            long sequenceNumber = row.getLong(1);
            Row file = row.getStruct(2);
            writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
        }
    } finally {
        writer.close();
    }
    return writer.toManifestFile();
}
Also used : Path(org.apache.hadoop.fs.Path) OutputFile(org.apache.iceberg.io.OutputFile) DataFile(org.apache.iceberg.DataFile) SparkDataFile(org.apache.iceberg.spark.SparkDataFile) Types(org.apache.iceberg.types.Types) Row(org.apache.spark.sql.Row)
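
The rows handed to writeManifest carry three fields: a snapshot id, a sequence number, and a nested data_file struct. A minimal, self-contained sketch (not Iceberg code; the literal values and field positions are illustrative) of building and reading a row of that shape with the same Row accessors:

// Illustrative only: a row shaped like (snapshot_id, sequence_number, data_file struct).
// Requires only org.apache.spark.sql.Row and org.apache.spark.sql.RowFactory.
Row dataFile = RowFactory.create("s3://bucket/data/00000-0-a.parquet", 1024L);
Row entry = RowFactory.create(42L, 7L, dataFile);

long snapshotId = entry.getLong(0);      // 42
long sequenceNumber = entry.getLong(1);  // 7
Row file = entry.getStruct(2);           // the nested data_file struct
String path = file.getString(0);         // "s3://bucket/data/00000-0-a.parquet"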

Example 27 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class BaseRewriteManifestsSparkAction, method doExecute, which drives the manifest rewrite: it validates that the matching manifests carry file counts, sizes the output, writes the new manifests, and commits the replacement:

private RewriteManifests.Result doExecute() {
    List<ManifestFile> matchingManifests = findMatchingManifests();
    if (matchingManifests.isEmpty()) {
        return BaseRewriteManifestsActionResult.empty();
    }
    long totalSizeBytes = 0L;
    int numEntries = 0;
    for (ManifestFile manifest : matchingManifests) {
        ValidationException.check(hasFileCounts(manifest), "No file counts in manifest: %s", manifest.path());
        totalSizeBytes += manifest.length();
        numEntries += manifest.addedFilesCount() + manifest.existingFilesCount() + manifest.deletedFilesCount();
    }
    int targetNumManifests = targetNumManifests(totalSizeBytes);
    int targetNumManifestEntries = targetNumManifestEntries(numEntries, targetNumManifests);
    Dataset<Row> manifestEntryDF = buildManifestEntryDF(matchingManifests);
    List<ManifestFile> newManifests;
    if (spec.fields().size() < 1) {
        newManifests = writeManifestsForUnpartitionedTable(manifestEntryDF, targetNumManifests);
    } else {
        newManifests = writeManifestsForPartitionedTable(manifestEntryDF, targetNumManifests, targetNumManifestEntries);
    }
    replaceManifests(matchingManifests, newManifests);
    return new BaseRewriteManifestsActionResult(matchingManifests, newManifests);
}
Also used : Row(org.apache.spark.sql.Row) BaseRewriteManifestsActionResult(org.apache.iceberg.actions.BaseRewriteManifestsActionResult) ManifestFile(org.apache.iceberg.ManifestFile)
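
The two sizing helpers are not shown in this snippet. One plausible shape for them, purely to illustrate the math (the constant and method bodies below are assumptions, not the actual BaseRewriteManifestsSparkAction implementation, which derives the target manifest size from the action's options and table properties):

// Assumed target size; the real action reads this from options/table properties.
private static final long TARGET_MANIFEST_SIZE_BYTES = 8L * 1024 * 1024;

private int targetNumManifests(long totalSizeBytes) {
    // ceiling division: enough output manifests to keep each near the size target
    return (int) ((totalSizeBytes + TARGET_MANIFEST_SIZE_BYTES - 1) / TARGET_MANIFEST_SIZE_BYTES);
}

private int targetNumManifestEntries(int numEntries, int numManifests) {
    // spread the entries evenly across the planned manifests
    return (numEntries + numManifests - 1) / numManifests;
}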

Example 28 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class Spark3BinPackStrategy, method rewriteFiles, which reads a staged group of file scan tasks back through Spark with adaptive execution disabled and writes them out as compacted data files:

@Override
public Set<DataFile> rewriteFiles(List<FileScanTask> filesToRewrite) {
    String groupID = UUID.randomUUID().toString();
    try {
        manager.stageTasks(table, groupID, filesToRewrite);
        // Disable Adaptive Query Execution as this may change the output partitioning of our write
        SparkSession cloneSession = spark.cloneSession();
        cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false);
        Dataset<Row> scanDF = cloneSession.read().format("iceberg")
            .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID)
            .option(SparkReadOptions.SPLIT_SIZE, splitSize(inputFileSize(filesToRewrite)))
            .option(SparkReadOptions.FILE_OPEN_COST, "0")
            .load(table.name());
        // All files within a file group are written with the same spec, so check the first
        boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table.spec());
        // Invoke a shuffle if the partition spec of the incoming partition does not match the table
        String distributionMode = requiresRepartition ? DistributionMode.RANGE.modeName() : DistributionMode.NONE.modeName();
        // write the packed data into new files where each split becomes a new file
        scanDF.write().format("iceberg").option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID).option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize()).option(SparkWriteOptions.DISTRIBUTION_MODE, distributionMode).mode("append").save(table.name());
        return rewriteCoordinator.fetchNewDataFiles(table, groupID);
    } finally {
        manager.removeTasks(table, groupID);
        rewriteCoordinator.clearRewrite(table, groupID);
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)
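
The cloned session keeps the AQE change local to this rewrite, so the caller's SparkSession configuration is left untouched. A small standalone sketch of that pattern, assuming the usual org.apache.spark.sql imports (the session names and the demo DataFrame are illustrative, not part of the Iceberg strategy):

// Standalone demo of cloneSession(): the conf change is visible only in the clone.
SparkSession spark = SparkSession.builder()
    .master("local[*]")
    .appName("clone-session-demo")
    .getOrCreate();

SparkSession cloned = spark.cloneSession();
// same key as SQLConf.ADAPTIVE_EXECUTION_ENABLED().key()
cloned.conf().set("spark.sql.adaptive.enabled", "false");

Dataset<Row> df = cloned.range(10).toDF("id");
df.show();

System.out.println(spark.conf().get("spark.sql.adaptive.enabled"));  // original session is unchanged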

Example 29 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestCreateActions, method validateTables, which checks that a migrated table contains the same rows as its source:

private void validateTables(String source, String dest) throws NoSuchTableException, ParseException {
    List<Row> expected = spark.table(source).collectAsList();
    SparkTable destTable = loadTable(dest);
    Assert.assertEquals("Provider should be iceberg", "iceberg", destTable.properties().get(TableCatalog.PROP_PROVIDER));
    List<Row> actual = spark.table(dest).collectAsList();
    Assert.assertTrue(
        String.format("Rows in migrated table did not match\nExpected :%s rows \nFound    :%s", expected, actual),
        expected.containsAll(actual) && actual.containsAll(expected));
}
Also used : Row(org.apache.spark.sql.Row) SparkTable(org.apache.iceberg.spark.source.SparkTable)
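
An alternative way to express the same check with the Dataset API, shown only as a sketch (like the containsAll comparison above, except() is set-based, so exact duplicate counts are not compared):

// Sketch: set-based comparison of the source and migrated tables.
Dataset<Row> sourceDF = spark.table(source);
Dataset<Row> destDF = spark.table(dest);

long missing = sourceDF.except(destDF).count();  // rows present in source but not in dest
long extra = destDF.except(sourceDF).count();    // rows present in dest but not in source

Assert.assertEquals("No rows should be missing after migration", 0, missing);
Assert.assertEquals("No unexpected rows after migration", 0, extra);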

Example 30 with Row

Use of org.apache.spark.sql.Row in project iceberg by apache.

The class TestFileRewriteCoordinator, method testBinPackRewrite, which stages four small data files, packs them into two splits, rewrites them, and verifies the commit summary:

@Test
public void testBinPackRewrite() throws NoSuchTableException, IOException {
    sql("CREATE TABLE %s (id INT, data STRING) USING iceberg", tableName);
    Dataset<Row> df = newDF(1000);
    df.coalesce(1).writeTo(tableName).append();
    df.coalesce(1).writeTo(tableName).append();
    df.coalesce(1).writeTo(tableName).append();
    df.coalesce(1).writeTo(tableName).append();
    Table table = validationCatalog.loadTable(tableIdent);
    Assert.assertEquals("Should produce 4 snapshots", 4, Iterables.size(table.snapshots()));
    Dataset<Row> fileDF = spark.read().format("iceberg").load(tableName(tableIdent.name() + ".files"));
    List<Long> fileSizes = fileDF.select("file_size_in_bytes").as(Encoders.LONG()).collectAsList();
    long avgFileSize = fileSizes.stream().mapToLong(i -> i).sum() / fileSizes.size();
    try (CloseableIterable<FileScanTask> fileScanTasks = table.newScan().planFiles()) {
        String fileSetID = UUID.randomUUID().toString();
        FileScanTaskSetManager taskSetManager = FileScanTaskSetManager.get();
        taskSetManager.stageTasks(table, fileSetID, Lists.newArrayList(fileScanTasks));
        // read and pack original 4 files into 2 splits
        Dataset<Row> scanDF = spark.read().format("iceberg")
            .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, fileSetID)
            .option(SparkReadOptions.SPLIT_SIZE, Long.toString(avgFileSize * 2))
            .option(SparkReadOptions.FILE_OPEN_COST, "0")
            .load(tableName);
        // write the packed data into new files where each split becomes a new file
        scanDF.writeTo(tableName).option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, fileSetID).append();
        // commit the rewrite
        FileRewriteCoordinator rewriteCoordinator = FileRewriteCoordinator.get();
        Set<DataFile> rewrittenFiles = taskSetManager.fetchTasks(table, fileSetID).stream()
            .map(FileScanTask::file)
            .collect(Collectors.toSet());
        Set<DataFile> addedFiles = rewriteCoordinator.fetchNewDataFiles(table, fileSetID);
        table.newRewrite().rewriteFiles(rewrittenFiles, addedFiles).commit();
    }
    table.refresh();
    Map<String, String> summary = table.currentSnapshot().summary();
    Assert.assertEquals("Deleted files count must match", "4", summary.get("deleted-data-files"));
    Assert.assertEquals("Added files count must match", "2", summary.get("added-data-files"));
    Object rowCount = scalarSql("SELECT count(*) FROM %s", tableName);
    Assert.assertEquals("Row count must match", 4000L, rowCount);
}
Also used : Table(org.apache.iceberg.Table) DataFile(org.apache.iceberg.DataFile) Row(org.apache.spark.sql.Row) FileScanTask(org.apache.iceberg.FileScanTask) Test(org.junit.Test)
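
The newDF helper is not shown above. A plausible sketch, assuming it only needs to produce the requested number of rows matching the (id INT, data STRING) schema of the CREATE TABLE statement (the expression and naming are assumptions, not the test's actual helper):

// Assumed helper: builds numRows rows of (id INT, data STRING) test data.
private Dataset<Row> newDF(int numRows) {
    return spark.range(1, numRows + 1)
        .selectExpr("CAST(id AS INT) AS id", "CONCAT('record-', CAST(id AS STRING)) AS data");
}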

Aggregations

Row (org.apache.spark.sql.Row): 1045
Test (org.junit.Test): 344
ArrayList (java.util.ArrayList): 244
SparkSession (org.apache.spark.sql.SparkSession): 243
StructType (org.apache.spark.sql.types.StructType): 215
Test (org.junit.jupiter.api.Test): 157
StructField (org.apache.spark.sql.types.StructField): 138
Table (org.apache.iceberg.Table): 127
Dataset (org.apache.spark.sql.Dataset): 123
List (java.util.List): 115
Script (org.apache.sysml.api.mlcontext.Script): 104
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 101
IOException (java.io.IOException): 78
Column (org.apache.spark.sql.Column): 78
File (java.io.File): 76
Collectors (java.util.stream.Collectors): 73
PartitionSpec (org.apache.iceberg.PartitionSpec): 70
DatasetBuilder (au.csiro.pathling.test.builders.DatasetBuilder): 66
Map (java.util.Map): 66
HadoopTables (org.apache.iceberg.hadoop.HadoopTables): 61