Search in sources :

Example 1 with SimpleFileIOOutputProperties

use of org.talend.components.simplefileio.output.SimpleFileIOOutputProperties in project components by Talend.

the class SimpleFileIOOutputErrorTest method testTryToOverwrite.

/**
 * Checks that writing to a destination which already contains a part file is rejected with
 * {@link SimpleFileIOErrorCode#OUTPUT_ALREADY_EXISTS}.
 *
 * <p>All output properties keep the values supplied by
 * {@code SimpleFileIOOutputRuntimeTest.createOutputComponentProperties()}, except for the path.
 */
@Test
public void testTryToOverwrite() throws IOException, URISyntaxException {
    Path target = new Path(new Path(mini.newFolder().toString()), "output");
    String targetSpec = mini.getLocalFs().getUri().resolve(target.toUri()).toString();

    // Pre-populate the destination so that it exists before the component runs.
    try (OutputStream existing = mini.getLocalFs().create(new Path(target, "part-00000"))) {
        existing.write(0);
    }

    // The run must fail with the "already exists" error for exactly this destination.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.OUTPUT_ALREADY_EXISTS)));
    thrown.expectMessage("The path " + targetSpec + " already exists. Please remove it manually.");

    try {
        // Point the output component at the pre-populated destination.
        SimpleFileIOOutputProperties outputProps = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
        outputProps.getDatasetProperties().path.setValue(targetSpec);

        SimpleFileIOOutputRuntime outputRuntime = new SimpleFileIOOutputRuntime();
        outputRuntime.initialize(null, outputProps);

        // Feed two records through the runtime in a direct pipeline.
        final Pipeline pipeline = beam.createPipeline();
        PCollection<IndexedRecord> records = pipeline.apply(
                Create.of(
                        ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
                        ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
        records.apply(outputRuntime);

        pipeline.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        // Surface the component's own exception when the pipeline wrapped it, so that the
        // expectations registered on the rule above can match it.
        Throwable cause = e.getCause();
        if (!(cause instanceof TalendRuntimeException)) {
            throw e;
        }
        throw (TalendRuntimeException) cause;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TalendRuntimeException(org.talend.daikon.exception.TalendRuntimeException) SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) OutputStream(java.io.OutputStream) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 2 with SimpleFileIOOutputProperties

use of org.talend.components.simplefileio.output.SimpleFileIOOutputProperties in project components by Talend.

the class SimpleFileIOOutputErrorTest method testUnauthorizedOverwrite.

/**
 * Checks that overwriting a destination whose permissions have been removed fails with
 * {@link SimpleFileIOErrorCode#OUTPUT_NOT_AUTHORIZED}.
 *
 * <p>The destination is created with one part file and then recursively chmod-ed to {@code 000}.
 * Permissions are restored once the test body has finished so that the temporary folder is not
 * left inaccessible for whatever cleans it up afterwards.
 *
 * @throws IOException if the destination cannot be prepared on the local file system.
 * @throws URISyntaxException kept from the original signature.
 */
@Test
public void testUnauthorizedOverwrite() throws IOException, URISyntaxException {
    Path parent = new Path(mini.newFolder().toString());
    Path dst = new Path(parent, "output");
    String fileSpec = mini.getLocalFs().getUri().resolve(dst.toUri()).toString();
    // Write something to the file before trying to run.
    try (OutputStream out = mini.getLocalFs().create(new Path(dst, "part-00000"))) {
        out.write(0);
    }
    // Ensure that the destination is unwritable.
    FileUtil.chmod(dst.toUri().toString(), "000", true);
    // Trying to overwrite an unmodifiable destination throws an exception.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.OUTPUT_NOT_AUTHORIZED)));
    thrown.expectMessage("Can not write to " + fileSpec + ". Please check user permissions or existence of base directory.");
    // Now try using the component.
    try {
        // Configure the component, explicitly asking it to overwrite the destination.
        SimpleFileIOOutputProperties props = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
        props.getDatasetProperties().path.setValue(fileSpec);
        props.overwrite.setValue(true);
        // Create the runtime.
        SimpleFileIOOutputRuntime runtime = new SimpleFileIOOutputRuntime();
        runtime.initialize(null, props);
        // Use the runtime in a direct pipeline to test.
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply(
                Create.of(
                        ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
                        ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
        input.apply(runtime);
        // And run the test: the driver-side step first, then the pipeline itself.
        runtime.runAtDriver(null);
        p.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        // Surface the component's own exception when the pipeline wrapped it, so that the
        // expectations registered on the rule above can match it.
        if (e.getCause() instanceof TalendRuntimeException) {
            throw (TalendRuntimeException) e.getCause();
        }
        throw e;
    } finally {
        // Give the permissions back; otherwise the destination stays at 000 after the test and
        // the temporary folder may not be removable.
        FileUtil.chmod(dst.toUri().toString(), "755", true);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TalendRuntimeException(org.talend.daikon.exception.TalendRuntimeException) SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) OutputStream(java.io.OutputStream) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 3 with SimpleFileIOOutputProperties

use of org.talend.components.simplefileio.output.SimpleFileIOOutputProperties in project components by Talend.

the class SimpleFileIORoundTripRuntimeTest method testParquet.

/**
 * Round-trip test for the Parquet format: writes a simple record set with the output component
 * and reads it back with the input component, both sharing the same dataset properties.
 */
@Test
public void testParquet() throws IOException {
    // Source data and the destination folder for the written files.
    RecordSet testData = getSimpleTestData(0);
    String targetSpec = mini.getLocalFsNewFolder() + "output/";

    // Output side: Parquet format at the destination.
    SimpleFileIOOutputProperties writerProps = createOutputComponentProperties();
    writerProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.PARQUET);
    writerProps.getDatasetProperties().path.setValue(targetSpec);

    // Input side: reuse the dataset configured for the output.
    SimpleFileIOInputProperties readerProps = createInputComponentProperties();
    readerProps.setDatasetProperties(writerProps.getDatasetProperties());

    List<IndexedRecord> roundTripped = runRoundTripPipelines(beam, testData.getAllData(), writerProps, readerProps);

    // The records read back are expected to match the records written, in any order.
    List<IndexedRecord> written = testData.getAllData();
    assertThat(roundTripped, containsInAnyOrder(written.toArray()));

    // TODO(rskraba): verify the file on the filesystem independently of the input component.
}
Also used : SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) RecordSet(org.talend.components.test.RecordSet) Test(org.junit.Test)

Example 4 with SimpleFileIOOutputProperties

use of org.talend.components.simplefileio.output.SimpleFileIOOutputProperties in project components by Talend.

the class SparkSimpleFileIOOutputRuntimeTestIT method testAvro_merge.

/**
 * Writes two records as Avro with the merge-output option enabled on a Spark pipeline, then
 * checks the records read back from the destination and that exactly one file is counted there.
 */
@Test
public void testAvro_merge() throws IOException {
    FileSystem hadoopFs = FileSystem.get(spark.createHadoopConfiguration());
    Path target = new Path(tmp.getRoot().toString(), "output.avro");
    String targetSpec = hadoopFs.getUri().resolve(target.toUri()).toString();

    // Output component: Avro format, merged into a single output.
    SimpleFileIOOutputProperties outputProps = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    outputProps.getDatasetProperties().path.setValue(targetSpec);
    outputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);
    outputProps.mergeOutput.setValue(true);

    SimpleFileIOOutputRuntime outputRuntime = new SimpleFileIOOutputRuntime();
    outputRuntime.initialize(null, outputProps);

    // Push two records through the runtime on a Spark pipeline.
    final Pipeline pipeline = spark.createPipeline();
    PCollection<IndexedRecord> records = pipeline.apply(
            Create.of(
                    ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
    records.apply(outputRuntime);
    pipeline.run().waitUntilFinish();

    // The destination must hold the same two records, stored in a single file.
    HashSet<IndexedRecord> expectedRecords = new HashSet<IndexedRecord>();
    expectedRecords.add(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }));
    expectedRecords.add(ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }));
    MiniDfsResource.assertReadAvroFile(hadoopFs, targetSpec, expectedRecords, false);
    MiniDfsResource.assertFileNumber(hadoopFs, targetSpec, 1);
}
Also used : Path(org.apache.hadoop.fs.Path) SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) FileSystem(org.apache.hadoop.fs.FileSystem) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 5 with SimpleFileIOOutputProperties

use of org.talend.components.simplefileio.output.SimpleFileIOOutputProperties in project components by Talend.

the class SparkSimpleFileIOOutputRuntimeTestIT method testParquet_merge.

/**
 * Writes two records as Parquet with the merge-output option enabled on a Spark pipeline, then
 * checks the records read back from the destination and that exactly one file is counted there.
 */
@Test
public void testParquet_merge() throws IOException {
    FileSystem hadoopFs = FileSystem.get(spark.createHadoopConfiguration());
    Path target = new Path(tmp.getRoot().toString(), "output.parquet");
    String targetSpec = hadoopFs.getUri().resolve(target.toUri()).toString();

    // Output component: Parquet format, merged into a single output.
    SimpleFileIOOutputProperties outputProps = SimpleFileIOOutputRuntimeTest.createOutputComponentProperties();
    outputProps.getDatasetProperties().path.setValue(targetSpec);
    outputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.PARQUET);
    outputProps.mergeOutput.setValue(true);

    SimpleFileIOOutputRuntime outputRuntime = new SimpleFileIOOutputRuntime();
    outputRuntime.initialize(null, outputProps);

    // Push two records through the runtime on a Spark pipeline.
    final Pipeline pipeline = spark.createPipeline();
    PCollection<IndexedRecord> records = pipeline.apply(
            Create.of(
                    ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }),
                    ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" })));
    records.apply(outputRuntime);
    pipeline.run().waitUntilFinish();

    // The destination must hold the same two records, stored in a single file.
    HashSet<IndexedRecord> expectedRecords = new HashSet<IndexedRecord>();
    expectedRecords.add(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }));
    expectedRecords.add(ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }));
    MiniDfsResource.assertReadParquetFile(hadoopFs, targetSpec, expectedRecords, false);
    MiniDfsResource.assertFileNumber(hadoopFs, targetSpec, 1);
}
Also used : Path(org.apache.hadoop.fs.Path) SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) FileSystem(org.apache.hadoop.fs.FileSystem) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

SimpleFileIOOutputProperties (org.talend.components.simplefileio.output.SimpleFileIOOutputProperties)20 Test (org.junit.Test)18 IndexedRecord (org.apache.avro.generic.IndexedRecord)17 ConvertToIndexedRecord (org.talend.components.adapter.beam.transform.ConvertToIndexedRecord)17 Pipeline (org.apache.beam.sdk.Pipeline)13 Path (org.apache.hadoop.fs.Path)13 SimpleFileIOInputProperties (org.talend.components.simplefileio.input.SimpleFileIOInputProperties)6 OutputStream (java.io.OutputStream)4 FileSystem (org.apache.hadoop.fs.FileSystem)4 RecordSet (org.talend.components.test.RecordSet)4 TalendRuntimeException (org.talend.daikon.exception.TalendRuntimeException)4 ArrayList (java.util.ArrayList)2 InputStream (java.io.InputStream)1 Schema (org.apache.avro.Schema)1 Ignore (org.junit.Ignore)1 Category (org.junit.experimental.categories.Category)1 SimpleFileIODatasetRuntimeTest (org.talend.components.simplefileio.runtime.SimpleFileIODatasetRuntimeTest)1 SimpleFileIOInputRuntime (org.talend.components.simplefileio.runtime.SimpleFileIOInputRuntime)1 SimpleFileIOOutputRuntime (org.talend.components.simplefileio.runtime.SimpleFileIOOutputRuntime)1