Search in sources :

Example 6 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testBasicCSV_changeSeparator2.

/**
 * Reads a CSV file produced by {@code writeRandomCsvFile} with a tab field delimiter and a carriage-return
 * record delimiter, and checks that the runtime returns every record of that file.
 */
@Test
public void testBasicCSV_changeSeparator2() throws IOException, URISyntaxException {
    final String path = "/user/test/input.csv";
    final String fieldSeparator = "\t";
    final String recordSeparator = "\r";

    // Write the input data and keep its textual content to build the expectations later.
    String content = writeRandomCsvFile(mini.getFs(), path, 0, 0, 10, 10, 6, fieldSeparator, recordSeparator);
    String fileSpec = mini.getFs().getUri().resolve(path).toString();

    // Point the component at the file and select the matching delimiters.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().path.setValue(fileSpec);
    inputProps.getDatasetProperties().recordDelimiter.setValue(RecordDelimiterType.CR);
    inputProps.getDatasetProperties().fieldDelimiter.setValue(FieldDelimiterType.TABULATION);

    // Build the runtime from the configured properties.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);

    // Apply the runtime in a direct pipeline.
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> readLines = p.apply(runtime);

    // One expected record per line of the generated content, split on the same separators.
    List<IndexedRecord> expected = new ArrayList<>();
    for (String line : content.split(recordSeparator)) {
        String[] fields = line.split(fieldSeparator);
        expected.add(ConvertToIndexedRecord.convertToAvro(fields));
    }
    PAssert.that(readLines).containsInAnyOrder(expected);

    // Execute the pipeline; the assertion above is evaluated during the run.
    p.run().waitUntilFinish();
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 7 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testBasicCsvFormatting.

/**
 * Checks CSV parsing against the three-column {@code CsvExample} cases: every possible input line of an
 * example is expected to be read back as that example's values.
 */
@Test
public void testBasicCsvFormatting() throws IOException, URISyntaxException {
    final String path = "/user/test/input.csv";

    // Collect each input line together with the record it is expected to produce.
    List<String> lines = new ArrayList<>();
    List<IndexedRecord> expected = new ArrayList<>();
    for (CsvExample example : CsvExample.getCsvExamples()) {
        // Only the 3 column examples share the same schema; skip all the others.
        if (example.getValues().length != 3) {
            continue;
        }
        for (String line : example.getPossibleInputLines()) {
            lines.add(line);
            expected.add(ConvertToIndexedRecord.convertToAvro(example.getValues()));
        }
    }

    // Write the collected lines as the input file.
    mini.writeFile(mini.getFs(), path, lines.toArray(new String[0]));
    String fileSpec = mini.getFs().getUri().resolve(path).toString();

    // Point the component at the file, leaving every other property untouched.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().path.setValue(fileSpec);

    // Build the runtime from the configured properties.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);

    // Apply the runtime in a direct pipeline and verify the records it reads.
    final Pipeline p = beam.createPipeline(3);
    PCollection<IndexedRecord> readLines = p.apply(runtime);
    PAssert.that(readLines).containsInAnyOrder(expected);
    p.run().waitUntilFinish();
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 8 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testBasicParquet.

/**
 * Reads a test file through the runtime configured with the {@code PARQUET} format and expects all rows of
 * the generated record set. Currently disabled through {@code @Ignore}.
 *
 * NOTE(review): the fixture is written with {@code writeRandomAvroFile} to an {@code .avro} path while the
 * component is configured for PARQUET - presumably a placeholder until the test is implemented; confirm.
 */
@Ignore("To implement.")
@Test
public void testBasicParquet() throws IOException, URISyntaxException {
    final String path = "/user/test/input.avro";

    // Generate the test data and write it as the input file.
    RecordSet testData = getSimpleTestData(0);
    writeRandomAvroFile(mini.getFs(), path, testData);
    String fileSpec = mini.getFs().getUri().resolve(path).toString();

    // Select the format and point the component at the file.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().format.setValue(SimpleFileIOFormat.PARQUET);
    inputProps.getDatasetProperties().path.setValue(fileSpec);

    // Build the runtime from the configured properties.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);

    // Apply the runtime in a direct pipeline.
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> readLines = p.apply(runtime);

    // Every row of the generated record set is expected, in any order.
    PAssert.that(readLines).containsInAnyOrder(testData.getAllData());

    // Execute the pipeline; the assertion above is evaluated during the run.
    p.run().waitUntilFinish();
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) RecordSet(org.talend.components.test.RecordSet) Pipeline(org.apache.beam.sdk.Pipeline) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 9 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testBasicDefaultsInvalidColumnNumber.

/**
 * Reads a file that is supposed to have 32 columns, but in which some of the lines do not contain 32 columns.
 * The expected records are built by splitting every line of the same resource on {@code ";"}.
 * Currently disabled through {@code @Ignore}.
 *
 * @throws IOException if the resource cannot be copied to the test file system or read back.
 */
@Ignore("Encoding trouble in the test.")
@Test
public void testBasicDefaultsInvalidColumnNumber() throws IOException {
    // Copy the classpath resource to the test file system. Both streams are closed even when the copy fails.
    try (InputStream in = getClass().getResourceAsStream("invalidColumnNumber.txt");
            OutputStream inOnMinFS = mini.getFs().create(new Path("/user/test/invalidColumnNumber.txt"))) {
        inOnMinFS.write(IOUtils.toByteArray(in));
    }
    String fileSpec = mini.getFs().getUri().resolve("/user/test/invalidColumnNumber.txt").toString();
    // Configure the component.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().path.setValue(fileSpec);
    // Create the runtime.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);
    // Use the runtime in a direct pipeline to test.
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> readLines = p.apply(runtime);
    // Check the expected values: re-read the resource line by line, closing the reader when done.
    List<IndexedRecord> expected = new ArrayList<>();
    // NOTE(review): the reader decodes with the platform default charset; this may be related to the
    // @Ignore reason above - confirm the encoding of the resource before pinning an explicit charset.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(getClass().getResourceAsStream("invalidColumnNumber.txt")))) {
        String sCurrentLine;
        while ((sCurrentLine = br.readLine()) != null) {
            expected.add(ConvertToIndexedRecord.convertToAvro(sCurrentLine.split(";")));
        }
    }
    PAssert.that(readLines).containsInAnyOrder(expected);
    // And run the test.
    p.run().waitUntilFinish();
}
Also used : Path(org.apache.hadoop.fs.Path) ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) Pipeline(org.apache.beam.sdk.Pipeline) BufferedReader(java.io.BufferedReader) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 10 with SimpleFileIOInputProperties

use of org.talend.components.simplefileio.input.SimpleFileIOInputProperties in project components by Talend.

the class SimpleFileIOInputRuntimeTest method testBasicCSV_changeSeparator.

/**
 * Reads a CSV file produced by {@code writeRandomCsvFile} with a space field delimiter and a CRLF record
 * delimiter, and checks that the runtime returns every record of that file.
 */
@Test
public void testBasicCSV_changeSeparator() throws IOException, URISyntaxException {
    final String path = "/user/test/input.csv";
    final String fieldSeparator = " ";
    final String recordSeparator = "\r\n";

    // Write the input data and keep its textual content to build the expectations later.
    String content = writeRandomCsvFile(mini.getFs(), path, 0, 0, 10, 10, 6, fieldSeparator, recordSeparator);
    String fileSpec = mini.getFs().getUri().resolve(path).toString();

    // Point the component at the file and select the matching delimiters.
    SimpleFileIOInputProperties inputProps = createInputComponentProperties();
    inputProps.getDatasetProperties().path.setValue(fileSpec);
    inputProps.getDatasetProperties().recordDelimiter.setValue(RecordDelimiterType.CRLF);
    inputProps.getDatasetProperties().fieldDelimiter.setValue(FieldDelimiterType.SPACE);

    // Build the runtime from the configured properties.
    SimpleFileIOInputRuntime runtime = new SimpleFileIOInputRuntime();
    runtime.initialize(null, inputProps);

    // Apply the runtime in a direct pipeline.
    final Pipeline p = beam.createPipeline();
    PCollection<IndexedRecord> readLines = p.apply(runtime);

    // One expected record per line of the generated content, split on the same separators.
    List<IndexedRecord> expected = new ArrayList<>();
    for (String line : content.split(recordSeparator)) {
        String[] fields = line.split(fieldSeparator);
        expected.add(ConvertToIndexedRecord.convertToAvro(fields));
    }
    PAssert.that(readLines).containsInAnyOrder(expected);

    // Execute the pipeline; the assertion above is evaluated during the run.
    p.run().waitUntilFinish();
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) SimpleFileIOInputProperties(org.talend.components.simplefileio.input.SimpleFileIOInputProperties) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

SimpleFileIOInputProperties (org.talend.components.simplefileio.input.SimpleFileIOInputProperties)20 IndexedRecord (org.apache.avro.generic.IndexedRecord)17 Test (org.junit.Test)17 ConvertToIndexedRecord (org.talend.components.adapter.beam.transform.ConvertToIndexedRecord)16 Pipeline (org.apache.beam.sdk.Pipeline)13 ArrayList (java.util.ArrayList)10 SimpleFileIOOutputProperties (org.talend.components.simplefileio.output.SimpleFileIOOutputProperties)6 RecordSet (org.talend.components.test.RecordSet)6 Path (org.apache.hadoop.fs.Path)3 InputStream (java.io.InputStream)2 OutputStream (java.io.OutputStream)2 Ignore (org.junit.Ignore)2 TalendRuntimeException (org.talend.daikon.exception.TalendRuntimeException)2 BufferedReader (java.io.BufferedReader)1 InputStreamReader (java.io.InputStreamReader)1 DirectOptions (org.apache.beam.runners.direct.DirectOptions)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 FsPermission (org.apache.hadoop.fs.permission.FsPermission)1 Category (org.junit.experimental.categories.Category)1 SimpleFileIODatasetRuntimeTest (org.talend.components.simplefileio.runtime.SimpleFileIODatasetRuntimeTest)1