Use of org.apache.spark.sql.internal.SQLConf in project spark-bigquery-connector by GoogleCloudDataproc: class SparkBigQueryConfigTest, method testSerializability.
// "project", "test_project"); // to remove the need for default project
@Test
public void testSerializability() throws IOException {
  Configuration hadoopConfiguration = new Configuration();
  DataSourceOptions options = new DataSourceOptions(defaultOptions);
  // test to make sure all members can be serialized
  new ObjectOutputStream(new ByteArrayOutputStream())
      .writeObject(SparkBigQueryConfig.from(
          options.asMap(), ImmutableMap.of(), hadoopConfiguration,
          DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty()));
}
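A possible follow-up, not in the original test: round-trip the config through Java serialization and spot-check one member. This is a sketch assuming the options map only needs the "table" key, the surrounding test method declares throws Exception, and the matching java.io input-stream imports are present.

DataSourceOptions opts = new DataSourceOptions(ImmutableMap.of("table", "dataset.table"));
SparkBigQueryConfig original = SparkBigQueryConfig.from(
    opts.asMap(), ImmutableMap.of(), new Configuration(),
    DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty());
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
  out.writeObject(original);  // same serialization path as the test above
}
try (ObjectInputStream in =
    new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
  SparkBigQueryConfig copy = (SparkBigQueryConfig) in.readObject();
  assertThat(copy.getTableId()).isEqualTo(original.getTableId());  // spot-check one field
}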

Use of org.apache.spark.sql.internal.SQLConf in project spark-bigquery-connector by GoogleCloudDataproc: class SparkBigQueryConfigTest, method testGetTableIdWithoutThePartition_PartitionExists.
@Test
public void testGetTableIdWithoutThePartition_PartitionExists() {
  Configuration hadoopConfiguration = new Configuration();
  DataSourceOptions options = new DataSourceOptions(
      ImmutableMap.of("table", "dataset.table", "datePartition", "20201010"));
  SparkBigQueryConfig config = SparkBigQueryConfig.from(
      options.asMap(), ImmutableMap.of(), hadoopConfiguration,
      DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty());
  assertThat(config.getTableId().getTable()).isEqualTo("table$20201010");
  assertThat(config.getTableIdWithoutThePartition().getTable()).isEqualTo("table");
  assertThat(config.getTableIdWithoutThePartition().getDataset())
      .isEqualTo(config.getTableId().getDataset());
  assertThat(config.getTableIdWithoutThePartition().getProject())
      .isEqualTo(config.getTableId().getProject());
}
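The same pair of options can be passed straight to a Spark read. A hedged usage sketch, assuming a SparkSession named spark is in scope; the connector resolves table plus datePartition to the decorated partition id asserted above (dataset.table$20201010):

Dataset<Row> partitionDf = spark.read()
    .format("bigquery")
    .option("datePartition", "20201010")  // same option key as the test above
    .load("dataset.table");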

Use of org.apache.spark.sql.internal.SQLConf in project spark-bigquery-connector by GoogleCloudDataproc: class SparkBigQueryProxyAndHttpConfigTest, method testConfigViaSparkBigQueryConfigWithHadoopConfiguration.
@Test
public void testConfigViaSparkBigQueryConfigWithHadoopConfiguration() throws URISyntaxException {
  HashMap<String, String> sparkConfigOptions = new HashMap<>();
  sparkConfigOptions.put("table", "dataset.table");
  DataSourceOptions options = new DataSourceOptions(sparkConfigOptions);
  SparkBigQueryConfig sparkConfig = SparkBigQueryConfig.from(
      options.asMap(),    // contains only one key, "table"
      ImmutableMap.of(),  // empty global options
      defaultHadoopConfiguration, 10, new SQLConf(), "2.4.0", Optional.empty());
  SparkBigQueryProxyAndHttpConfig config =
      (SparkBigQueryProxyAndHttpConfig) sparkConfig.getBigQueryProxyConfig();
  assertThat(config.getProxyUri())
      .isEqualTo(Optional.of(getURI("http", "bq-connector-host-hadoop", 1234)));
  assertThat(config.getProxyUsername()).isEqualTo(Optional.of("bq-connector-user-hadoop"));
  assertThat(config.getProxyPassword()).isEqualTo(Optional.of("bq-connector-password-hadoop"));
  assertThat(config.getHttpMaxRetry()).isEqualTo(Optional.of(30));
  assertThat(config.getHttpConnectTimeout()).isEqualTo(Optional.of(30000));
  assertThat(config.getHttpReadTimeout()).isEqualTo(Optional.of(40000));
}
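The proxy and HTTP values asserted here come from defaultHadoopConfiguration, which is built elsewhere in the test class. A sketch of what that setup plausibly looks like; the fs.gs.* key names are an assumption here (the connector falls back to GCS-connector-style Hadoop keys), so treat SparkBigQueryProxyAndHttpConfig as the authoritative list:

Configuration defaultHadoopConfiguration = new Configuration();
// Assumed key names; the values mirror the assertions in the test above.
defaultHadoopConfiguration.set("fs.gs.proxy.address", "http://bq-connector-host-hadoop:1234");
defaultHadoopConfiguration.set("fs.gs.proxy.username", "bq-connector-user-hadoop");
defaultHadoopConfiguration.set("fs.gs.proxy.password", "bq-connector-password-hadoop");
defaultHadoopConfiguration.set("fs.gs.http.max.retry", "30");
defaultHadoopConfiguration.set("fs.gs.http.connect-timeout", "30000");
defaultHadoopConfiguration.set("fs.gs.http.read-timeout", "40000");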

Use of org.apache.spark.sql.internal.SQLConf in project iceberg by apache: class Spark3SortStrategy, method rewriteFiles.
@Override
public Set<DataFile> rewriteFiles(List<FileScanTask> filesToRewrite) {
  String groupID = UUID.randomUUID().toString();
  boolean requiresRepartition = !filesToRewrite.get(0).spec().equals(table.spec());

  SortOrder[] ordering;
  if (requiresRepartition) {
    // Build in the requirement for Partition Sorting into our sort order
    ordering = SparkDistributionAndOrderingUtil.convert(
        SortOrderUtil.buildSortOrder(table, sortOrder()));
  } else {
    ordering = SparkDistributionAndOrderingUtil.convert(sortOrder());
  }

  Distribution distribution = Distributions.ordered(ordering);

  try {
    manager.stageTasks(table, groupID, filesToRewrite);

    // Disable Adaptive Query Execution as this may change the output partitioning of our write
    SparkSession cloneSession = spark.cloneSession();
    cloneSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false);

    // Reset Shuffle Partitions for our sort
    long numOutputFiles =
        numOutputFiles((long) (inputFileSize(filesToRewrite) * sizeEstimateMultiple));
    cloneSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), Math.max(1, numOutputFiles));

    Dataset<Row> scanDF = cloneSession.read()
        .format("iceberg")
        .option(SparkReadOptions.FILE_SCAN_TASK_SET_ID, groupID)
        .load(table.name());

    // write the packed data into new files where each split becomes a new file
    SQLConf sqlConf = cloneSession.sessionState().conf();
    LogicalPlan sortPlan = sortPlan(distribution, ordering, scanDF.logicalPlan(), sqlConf);
    Dataset<Row> sortedDf = new Dataset<>(cloneSession, sortPlan, scanDF.encoder());

    sortedDf.write()
        .format("iceberg")
        .option(SparkWriteOptions.REWRITTEN_FILE_SCAN_TASK_SET_ID, groupID)
        .option(SparkWriteOptions.TARGET_FILE_SIZE_BYTES, writeMaxFileSize())
        .option(SparkWriteOptions.USE_TABLE_DISTRIBUTION_AND_ORDERING, "false")
        // This will only write files without modifying the table, see SparkWrite.RewriteFiles
        .mode("append")
        .save(table.name());

    return rewriteCoordinator.fetchNewDataFiles(table, groupID);
  } finally {
    manager.removeTasks(table, groupID);
    rewriteCoordinator.clearRewrite(table, groupID);
  }
}
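The SQLConf-specific piece is the clone-then-override pattern: cloneSession() gives the rewrite job its own session state, so disabling adaptive query execution and resizing shuffle partitions does not leak into other queries on the shared session. A minimal standalone sketch with illustrative values:

// Per-job SQL conf overrides via a cloned session; the parent session is untouched.
SparkSession jobSession = spark.cloneSession();
jobSession.conf().set(SQLConf.ADAPTIVE_EXECUTION_ENABLED().key(), false);  // keep the write's output partitioning stable
jobSession.conf().set(SQLConf.SHUFFLE_PARTITIONS().key(), 64L);            // illustrative; the strategy derives this from input size
// Datasets created through jobSession see the overrides; `spark` keeps its original conf.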

Use of org.apache.spark.sql.internal.SQLConf in project iceberg by apache: class SparkTestBase, method withSQLConf.
protected void withSQLConf(Map<String, String> conf, Action action) {
  SQLConf sqlConf = SQLConf.get();

  Map<String, String> currentConfValues = Maps.newHashMap();
  conf.keySet().forEach(confKey -> {
    if (sqlConf.contains(confKey)) {
      String currentConfValue = sqlConf.getConfString(confKey);
      currentConfValues.put(confKey, currentConfValue);
    }
  });

  conf.forEach((confKey, confValue) -> {
    if (SQLConf.isStaticConfigKey(confKey)) {
      throw new RuntimeException("Cannot modify the value of a static config: " + confKey);
    }
    sqlConf.setConfString(confKey, confValue);
  });

  try {
    action.invoke();
  } finally {
    conf.forEach((confKey, confValue) -> {
      if (currentConfValues.containsKey(confKey)) {
        sqlConf.setConfString(confKey, currentConfValues.get(confKey));
      } else {
        sqlConf.unsetConf(confKey);
      }
    });
  }
}
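A hedged usage sketch of the helper, assuming Iceberg's Action functional interface (a no-arg invoke()): flip a dynamic conf key for one block and let the finally block above restore or unset it, even if the block throws.

withSQLConf(
    ImmutableMap.of(SQLConf.CASE_SENSITIVE().key(), "true"),
    () -> {
      // assertions that depend on case-sensitive analysis go here
    });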