
Example 1 with DataSourceOptions

Use of org.apache.spark.sql.sources.v2.DataSourceOptions in the project spark-bigquery-connector by GoogleCloudDataproc.

From the class SparkBigQueryConfigTest, method testConfigFromGlobalOptions:

@Test
public void testConfigFromGlobalOptions() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(
        ImmutableMap.<String, String>builder().put("table", "dataset.table").build());
    ImmutableMap<String, String> globalOptions = ImmutableMap.<String, String>builder()
        .put("viewsEnabled", "true")
        .put("spark.datasource.bigquery.temporaryGcsBucket", "bucket")
        .put("bqStorageReadEndpoint", "ep")
        .put("bqEncodedCreateReadSessionRequest", "ec")
        .put("writeMethod", "direct")
        .build();
    SparkBigQueryConfig config = SparkBigQueryConfig.from(
        options.asMap(), globalOptions, hadoopConfiguration,
        DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty());
    assertThat(config.isViewsEnabled()).isTrue();
    assertThat(config.getTemporaryGcsBucket()).isEqualTo(Optional.of("bucket"));
    assertThat(config.toReadSessionCreatorConfig().endpoint().get()).isEqualTo("ep");
    assertThat(config.toReadSessionCreatorConfig().getRequestEncodedBase().get()).isEqualTo("ec");
    assertThat(config.getWriteMethod()).isEqualTo(SparkBigQueryConfig.WriteMethod.DIRECT);
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), SQLConf (org.apache.spark.sql.internal.SQLConf), Test (org.junit.Test)
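
In a real job, these two maps mirror how options reach the connector: the DataSourceOptions map carries per-read options, while globalOptions is populated from the Spark configuration, where keys may appear bare (viewsEnabled) or with the spark.datasource.bigquery. prefix, as the test's globalOptions map shows. A minimal sketch of the user-facing equivalent (the app name and table are illustrative, not from the test):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class GlobalOptionsSketch {
    public static void main(String[] args) {
        // Global connector options go on the Spark conf; the test's globalOptions map
        // shows both the bare and the spark.datasource.bigquery.-prefixed forms.
        SparkSession spark = SparkSession.builder()
            .appName("bigquery-options-sketch")
            .config("viewsEnabled", "true")
            .config("spark.datasource.bigquery.temporaryGcsBucket", "bucket")
            .getOrCreate();
        // Per-read options correspond to the DataSourceOptions map in the test.
        Dataset<Row> df = spark.read().format("bigquery").option("table", "dataset.table").load();
        df.show();
    }
}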

Example 2 with DataSourceOptions

Use of org.apache.spark.sql.sources.v2.DataSourceOptions in the project spark-bigquery-connector by GoogleCloudDataproc.

From the class SparkBigQueryConfigTest, method testDefaults:

@Test
public void testDefaults() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(defaultOptions);
    SparkBigQueryConfig config = SparkBigQueryConfig.from(
        options.asMap(), ImmutableMap.of(), hadoopConfiguration,
        DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty());
    assertThat(config.getTableId()).isEqualTo(TableId.of("dataset", "table"));
    assertThat(config.getFilter()).isEqualTo(Optional.empty());
    assertThat(config.getSchema()).isEqualTo(Optional.empty());
    assertThat(config.getMaxParallelism()).isEqualTo(OptionalInt.empty());
    assertThat(config.getTemporaryGcsBucket()).isEqualTo(Optional.empty());
    assertThat(config.getIntermediateFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_INTERMEDIATE_FORMAT);
    assertThat(config.getReadDataFormat()).isEqualTo(SparkBigQueryConfig.DEFAULT_READ_DATA_FORMAT);
    assertThat(config.getMaterializationProject()).isEqualTo(Optional.empty());
    assertThat(config.getMaterializationDataset()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionField()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionExpirationMs()).isEqualTo(OptionalLong.empty());
    assertThat(config.getPartitionRequireFilter()).isEqualTo(Optional.empty());
    assertThat(config.getPartitionType()).isEqualTo(Optional.empty());
    assertThat(config.getClusteredFields()).isEqualTo(Optional.empty());
    assertThat(config.getCreateDisposition()).isEqualTo(Optional.empty());
    assertThat(config.getLoadSchemaUpdateOptions()).isEqualTo(ImmutableList.of());
    assertThat(config.getMaterializationExpirationTimeInMinutes()).isEqualTo(24 * 60);
    assertThat(config.getMaxReadRowsRetries()).isEqualTo(3);
    assertThat(config.isUseAvroLogicalTypes()).isFalse();
    assertThat(config.getBigQueryClientConnectTimeout()).isEqualTo(60 * 1000);
    assertThat(config.getBigQueryClientReadTimeout()).isEqualTo(60 * 1000);
    assertThat(config.getBigQueryClientRetrySettings().getMaxAttempts()).isEqualTo(10);
    assertThat(config.getArrowCompressionCodec()).isEqualTo(CompressionCodec.COMPRESSION_UNSPECIFIED);
    assertThat(config.getWriteMethod()).isEqualTo(SparkBigQueryConfig.WriteMethod.INDIRECT);
    assertThat(config.getCacheExpirationTimeInMinutes()).isEqualTo(SparkBigQueryConfig.DEFAULT_CACHE_EXPIRATION_IN_MINUTES);
    assertThat(config.getTraceId().isPresent()).isFalse();
    assertThat(config.getBigQueryJobLabels()).isEmpty();
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), SQLConf (org.apache.spark.sql.internal.SQLConf), Test (org.junit.Test)
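
The defaultOptions fixture is defined elsewhere in SparkBigQueryConfigTest and is not shown in these excerpts. Judging only from the getTableId() assertion above, a minimal stand-in (hypothetical, reconstructed from that assertion alone) would be:

import com.google.common.collect.ImmutableMap;

// Hypothetical stand-in for the fixture in SparkBigQueryConfigTest; the assertion
// getTableId() == TableId.of("dataset", "table") implies at least this entry.
private static final ImmutableMap<String, String> defaultOptions =
    ImmutableMap.of("table", "dataset.table");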

Example 3 with DataSourceOptions

Use of org.apache.spark.sql.sources.v2.DataSourceOptions in the project spark-bigquery-connector by GoogleCloudDataproc.

From the class SparkBigQueryConfigTest, method testInvalidCompressionCodec:

@Test
public void testInvalidCompressionCodec() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(
        ImmutableMap.<String, String>builder()
            .put("table", "test_t").put("dataset", "test_d").put("project", "test_p")
            .put("arrowCompressionCodec", "randomCompression").build());
    IllegalArgumentException exception = Assert.assertThrows(
        IllegalArgumentException.class,
        () -> SparkBigQueryConfig.from(options.asMap(), ImmutableMap.of(), hadoopConfiguration,
            DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty()));
    assertThat(exception).hasMessageThat().contains(
        "Compression codec 'RANDOMCOMPRESSION' for Arrow is not supported."
            + " Supported formats are " + Arrays.toString(CompressionCodec.values()));
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), SQLConf (org.apache.spark.sql.internal.SQLConf), Test (org.junit.Test)
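
For contrast, a passing configuration would use one of the names returned by CompressionCodec.values(); in recent Storage Read API releases these are COMPRESSION_UNSPECIFIED, LZ4_FRAME, and ZSTD, though the exact set depends on the connector version. A sketch of the corrected options map, assuming ZSTD is supported in the version under test:

// Same options as the test, but with a codec assumed to be supported
// (verify against CompressionCodec.values() for your connector version).
DataSourceOptions options = new DataSourceOptions(
    ImmutableMap.<String, String>builder()
        .put("table", "test_t").put("dataset", "test_d").put("project", "test_p")
        .put("arrowCompressionCodec", "ZSTD").build());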

Example 4 with DataSourceOptions

Use of org.apache.spark.sql.sources.v2.DataSourceOptions in the project spark-bigquery-connector by GoogleCloudDataproc.

From the class SparkBigQueryConfigTest, method testSerializability:

// "project", "test_project"); // to remove the need for default project
@Test
public void testSerializability() throws IOException {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(defaultOptions);
    // test to make sure all members can be serialized.
    new ObjectOutputStream(new ByteArrayOutputStream())
        .writeObject(SparkBigQueryConfig.from(
            options.asMap(), ImmutableMap.of(), hadoopConfiguration,
            DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty()));
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), ByteArrayOutputStream (java.io.ByteArrayOutputStream), ObjectOutputStream (java.io.ObjectOutputStream), SQLConf (org.apache.spark.sql.internal.SQLConf), Test (org.junit.Test)
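
The test only proves that writeObject succeeds; a slightly stronger variant (a sketch, not part of the actual test suite) would also deserialize the config to confirm the full round trip:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

// Sketch: serialize a config built as in the test above, then read it back
// (the caller must declare or handle IOException and ClassNotFoundException).
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
new ObjectOutputStream(buffer).writeObject(config);
SparkBigQueryConfig restored = (SparkBigQueryConfig)
    new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray())).readObject();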

Example 5 with DataSourceOptions

Use of org.apache.spark.sql.sources.v2.DataSourceOptions in the project spark-bigquery-connector by GoogleCloudDataproc.

From the class SparkBigQueryConfigTest, method testGetTableIdWithoutThePartition_PartitionExists:

@Test
public void testGetTableIdWithoutThePartition_PartitionExists() {
    Configuration hadoopConfiguration = new Configuration();
    DataSourceOptions options = new DataSourceOptions(
        ImmutableMap.of("table", "dataset.table", "datePartition", "20201010"));
    SparkBigQueryConfig config = SparkBigQueryConfig.from(
        options.asMap(), ImmutableMap.of(), hadoopConfiguration,
        DEFAULT_PARALLELISM, new SQLConf(), SPARK_VERSION, Optional.empty());
    assertThat(config.getTableId().getTable()).isEqualTo("table$20201010");
    assertThat(config.getTableIdWithoutThePartition().getTable()).isEqualTo("table");
    assertThat(config.getTableIdWithoutThePartition().getDataset())
        .isEqualTo(config.getTableId().getDataset());
    assertThat(config.getTableIdWithoutThePartition().getProject())
        .isEqualTo(config.getTableId().getProject());
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions), SQLConf (org.apache.spark.sql.internal.SQLConf), Test (org.junit.Test)
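
The $ in table$20201010 is BigQuery's partition decorator: the datePartition option is appended to the table name so the operation targets a single partition, and getTableIdWithoutThePartition() strips it off again when the base table is needed. Through the DataFrame API, datePartition is documented for the write path; an illustrative use (the bucket name is a placeholder):

import org.apache.spark.sql.SaveMode;

// Illustrative: overwrite only the 2020-10-10 partition of dataset.table.
df.write()
    .format("bigquery")
    .option("temporaryGcsBucket", "some-bucket")
    .option("datePartition", "20201010")
    .mode(SaveMode.Overwrite)
    .save("dataset.table");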

Aggregations

DataSourceOptions (org.apache.spark.sql.sources.v2.DataSourceOptions): 38
Test (org.junit.Test): 33
HashMap (java.util.HashMap): 13
Configuration (org.apache.hadoop.conf.Configuration): 13
SQLConf (org.apache.spark.sql.internal.SQLConf): 10
ArrayList (java.util.ArrayList): 4
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 4
Row (org.apache.spark.sql.Row): 4
InternalRow (org.apache.spark.sql.catalyst.InternalRow): 4
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 4
MethodSource (org.junit.jupiter.params.provider.MethodSource): 4
List (java.util.List): 3
DataSourceReader (org.apache.spark.sql.sources.v2.reader.DataSourceReader): 3
Layout (io.tiledb.java.api.Layout): 2
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 2
File (java.io.File): 2
ObjectOutputStream (java.io.ObjectOutputStream): 2
URI (java.net.URI): 2
DataFile (org.apache.iceberg.DataFile): 2
Table (org.apache.iceberg.Table): 2