Search in sources:

Example 11 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testInputParquetByteBufferSerialization.

/**
 * Test to read a Parquet input and dump it as CSV. This is a special case to check the support of ByteBuffer
 * coding/decoding. This test is currently not working due to a log on the Beam class
 * ExecutorServiceParallelExecutor, which will move the offset of any ByteBuffer.
 *
 * @throws IOException if copying the test resource or accessing the filesystem fails.
 * @throws URISyntaxException declared by the original test signature.
 */
@Test
public void testInputParquetByteBufferSerialization() throws IOException, URISyntaxException {
    // Copy the Parquet test resource to the filesystem returned by mini.getFs(). Both streams are declared in
    // the try-with-resources header so that the classpath resource stream is closed as well as the output.
    try (InputStream in = getClass().getResourceAsStream("two_lines.snappy.parquet");
            OutputStream inOnMinFS = mini.getFs().create(new Path("/user/test/two_lines.snappy.parquet"))) {
        inOnMinFS.write(IOUtils.toByteArray(in));
    }
    String fileSpec = mini.getFs().getUri().resolve("/user/test/two_lines.snappy.parquet").toString();
    String fileSpecOutput = mini.getLocalFs().getUri().resolve(new Path(mini.newFolder().toString(), "output.csv").toUri()).toString();
    // Configure the input component to read the Parquet file.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.PARQUET);
    inputProps.getDatasetProperties().path.setValue(fileSpec);
    // Create the input runtime.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);
    // Configure the output component to write CSV to the local output path.
    SimpleFileIOOutputProperties outputProps = new SimpleFileIOOutputProperties(null);
    outputProps.init();
    outputProps.setDatasetProperties(SimpleFileIODatasetRuntimeTest.createDatasetProperties());
    outputProps.getDatasetProperties().path.setValue(fileSpecOutput);
    outputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.CSV);
    // Create the output runtime.
    SimpleFileIOOutputRuntime runtimeO = new SimpleFileIOOutputRuntime();
    runtimeO.initialize(null, outputProps);
    // Use the runtimes in a direct pipeline to test.
    final Pipeline p = beam.createPipeline(1);
    p.apply(runtime).apply(runtimeO);
    p.run().waitUntilFinish();
    // The CSV output must contain the two records of the Parquet file.
    mini.assertReadFile(mini.getLocalFs(), fileSpecOutput, "1;rdubois", "2;clombard");
}
Also used : Path(org.apache.hadoop.fs.Path) SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 12 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class GSRoundTripRuntimeTestIT method testCsv.

/**
 * Round-trip test: writes two records to {@code gsPath} through the output runtime, then reads everything
 * matching {@code gsPath + "*"} through the input runtime and asserts that the same records come back.
 */
@Test
public void testCsv() {
    // The records written by the first pipeline and expected back from the second one.
    List<IndexedRecord> records = new ArrayList<>();
    records.add(ConvertToIndexedRecord.convertToAvro(new String[] { "1", "one" }));
    records.add(ConvertToIndexedRecord.convertToAvro(new String[] { "2", "two" }));

    // Write phase: configure the output component and run the write pipeline to completion.
    SimpleFileIOOutputProperties writeProps = createOutputProps();
    writeProps.getDatasetProperties().path.setValue(gsPath);
    SimpleFileIOOutputRuntime writer = new SimpleFileIOOutputRuntime();
    writer.initialize(null, writeProps);
    writeP.apply(Create.of(records)).apply(writer);
    writeP.run(pipelineOptions).waitUntilFinish();

    // Read phase: configure the input component with a glob on the written path and verify the content.
    SimpleFileIOInputProperties readProps = createInputProps();
    readProps.getDatasetProperties().path.setValue(gsPath + "*");
    SimpleFileIOInputRuntime reader = new SimpleFileIOInputRuntime();
    reader.initialize(null, readProps);
    PAssert.that(readP.apply(reader)).containsInAnyOrder(records);
    readP.run(pipelineOptions).waitUntilFinish();
}
Also used : SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) SimpleFileIOInputRuntime(org.talend.components.simplefileio.runtime.SimpleFileIOInputRuntime) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) SimpleFileIOOutputRuntime(org.talend.components.simplefileio.runtime.SimpleFileIOOutputRuntime) Test(org.junit.Test) SimpleFileIODatasetRuntimeTest(org.talend.components.simplefileio.runtime.SimpleFileIODatasetRuntimeTest)

Example 13 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputErrorTest method testUnauthorizedRead.

/**
 * Checks that reading a file fails with {@link SimpleFileIOErrorCode#INPUT_NOT_AUTHORIZED} when the component is
 * configured with a user ("baduser") while the parent directory of the file is owned by another user
 * ("gooduser") and accessible to its owner only.
 */
@Test
public void testUnauthorizedRead() throws IOException, URISyntaxException {
    String csvContent = writeRandomCsvFile(mini.getFs(), "/user/test/input.csv", 0, 0, 10, 10, 6, ";", "\n");
    String fileSpec = mini.getFs().getUri().resolve("/user/test/input.csv").toString();

    // Restrict the parent directory to its owner, then hand ownership to "gooduser".
    Path parentDir = new Path(fileSpec).getParent();
    mini.getFs().setPermission(parentDir, new FsPermission(FsAction.ALL, FsAction.NONE, FsAction.NONE));
    mini.getFs().setOwner(parentDir, "gooduser", "gooduser");

    // Configure the component to read the file as "baduser".
    SimpleFileIOInputProperties inputProps = SimpleFileIOInputRuntimeTest.createInputComponentProperties();
    inputProps.getDatasetProperties().path.setValue(fileSpec);
    inputProps.getDatasetProperties().getDatastoreProperties().userName.setValue("baduser");

    // Create the runtime.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);

    // Declare the failure that the rest of the test is expected to raise.
    thrown.expect(TalendRuntimeException.class);
    thrown.expect(hasProperty("code", is(SimpleFileIOErrorCode.INPUT_NOT_AUTHORIZED)));
    thrown.expectMessage("baduser can not read from " + fileSpec + ". Please check user permissions or existence of base directory.");

    try {
        // Use the runtime in a direct pipeline to test.
        final Pipeline pipeline = beam.createPipeline();
        PCollection<IndexedRecord> readLines = pipeline.apply(runtime);

        // The records that would be read if the file were accessible.
        List<IndexedRecord> expected = new ArrayList<>();
        for (String line : csvContent.split("\n")) {
            String[] fields = line.split(";");
            expected.add(ConvertToIndexedRecord.convertToAvro(fields));
        }
        PAssert.that(readLines).containsInAnyOrder(expected);

        // And run the test.
        pipeline.run().waitUntilFinish();
    } catch (Pipeline.PipelineExecutionException e) {
        // Unwrap the pipeline exception so that the expectations above apply to the underlying cause.
        Throwable cause = e.getCause();
        if (cause instanceof TalendRuntimeException) {
            throw (TalendRuntimeException) cause;
        }
        throw e;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TalendRuntimeException(org.talend.daikon.exception.TalendRuntimeException) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) FsPermission(org.apache.hadoop.fs.permission.FsPermission) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 14 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIORoundTripRuntimeTest method testCsvWithDelimiters.

/**
 * Test CSV with custom delimiters: records are written with "---" as record delimiter and "|" as field
 * delimiter, read back with the same dataset properties, and compared with the expected CSV records.
 *
 * @throws IOException declared by the original test signature.
 */
@Test
public void testCsvWithDelimiters() throws IOException {
    // The file that we will be creating.
    RecordSet rs = getSimpleTestData(0);
    String fileSpec = mini.getLocalFsNewFolder() + "output/";
    // Configure the components.
    SimpleFileIOOutputProperties outputProps = createOutputComponentProperties();
    outputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.CSV);
    outputProps.getDatasetProperties().path.setValue(fileSpec);
    outputProps.getDatasetProperties().recordDelimiter.setValue(RecordDelimiterType.OTHER);
    outputProps.getDatasetProperties().specificRecordDelimiter.setValue("---");
    outputProps.getDatasetProperties().fieldDelimiter.setValue(FieldDelimiterType.OTHER);
    outputProps.getDatasetProperties().specificFieldDelimiter.setValue("|");
    // The input shares the dataset properties of the output so that it reads what was written.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.setDatasetProperties(outputProps.getDatasetProperties());
    List<IndexedRecord> actual = runRoundTripPipelines(beam, rs.getAllData(), outputProps, inputProps);
    // Generate the set of expected records. By default, CSV turns all columns into String and loses the original
    // column name.
    List<IndexedRecord> expected = rewriteRecordsWithCsvSchema(rs.getAllData());
    // Hamcrest's assertThat takes the actual value first and the matcher built from the expected values second,
    // so that a failure message reports the right side as "Expected".
    assertThat(actual, containsInAnyOrder(expected.toArray()));
    // Verify that the file on the filesystem was correctly written.
    mini.assertReadFile("---", mini.getLocalFs(), fileSpec, rewriteRecordsAsCsvLines(expected, inputProps.getDatasetProperties().getRecordDelimiter(), inputProps.getDatasetProperties().getFieldDelimiter()));
}
Also used : SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) RecordSet(org.talend.components.test.RecordSet) Test(org.junit.Test)

Example 15 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIORoundTripRuntimeTest method testAvro.

/**
 * Basic Avro round trip: writes the test records in Avro format, reads them back with the same dataset
 * properties and checks that the records read match the records written.
 */
@Test
public void testAvro() throws IOException {
    // The data to write and the folder that will receive it.
    RecordSet testData = getSimpleTestData(0);
    String outputFolder = mini.getLocalFsNewFolder() + "output/";

    // Configure the output component for Avro.
    SimpleFileIOOutputProperties outputProps = createOutputComponentProperties();
    outputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.AVRO);
    outputProps.getDatasetProperties().path.setValue(outputFolder);

    // The input component reuses the dataset properties of the output.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.setDatasetProperties(outputProps.getDatasetProperties());

    List<IndexedRecord> actual = runRoundTripPipelines(beam, testData.getAllData(), outputProps, inputProps);

    // The records read back are compared directly with the records that were written.
    List<IndexedRecord> expected = testData.getAllData();
    assertThat(actual, containsInAnyOrder(expected.toArray()));

    // Verify that the file on the filesystem was correctly written.
    // TODO(rskraba): verify independently
    // mini.assertReadFile(
    // mini.getLocalFs(),
    // fileSpec,
    // rewriteRecordsAsCsvLines(expected, inputProps.getDatasetProperties().recordDelimiter.getValue(),
    // inputProps.getDatasetProperties().fieldDelimiter.getValue()));
}
Also used : SimpleFileIOOutputProperties(org.talend.components.simplefileio.output.SimpleFileIOOutputProperties) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) RecordSet(org.talend.components.test.RecordSet) Test(org.junit.Test)

Aggregations

SimpleFileIOInputProperties (org.talend.components.simplefileio.input.SimpleFileIOInputProperties): 20, IndexedRecord (org.apache.avro.generic.IndexedRecord): 17, Test (org.junit.Test): 17, ConvertToIndexedRecord (org.talend.components.adapter.beam.transform.ConvertToIndexedRecord): 16, Pipeline (org.apache.beam.sdk.Pipeline): 13, ArrayList (java.util.ArrayList): 10, SimpleFileIOOutputProperties (org.talend.components.simplefileio.output.SimpleFileIOOutputProperties): 6, RecordSet (org.talend.components.test.RecordSet): 6, Path (org.apache.hadoop.fs.Path): 3, InputStream (java.io.InputStream): 2, OutputStream (java.io.OutputStream): 2, Ignore (org.junit.Ignore): 2, TalendRuntimeException (org.talend.daikon.exception.TalendRuntimeException): 2, BufferedReader (java.io.BufferedReader): 1, InputStreamReader (java.io.InputStreamReader): 1, DirectOptions (org.apache.beam.runners.direct.DirectOptions): 1, FileSystem (org.apache.hadoop.fs.FileSystem): 1, FsPermission (org.apache.hadoop.fs.permission.FsPermission): 1, Category (org.junit.experimental.categories.Category): 1, SimpleFileIODatasetRuntimeTest (org.talend.components.simplefileio.runtime.SimpleFileIODatasetRuntimeTest): 1