Search in sources :

Example 16 with LineByLineFileInputOperator

Use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in the project apex-malhar by apache.

This example is from the class AbstractFileInputOperatorTest, method testStateWithIdempotency.

/**
 * Verifies exactly-once (idempotent) recovery of the operator's file-tracking state.
 * A first run reads three 2-line files while an {@link FSWindowDataManager} records
 * per-window state under {@code <dir>/recovery}. Stale pending/failed/unfinished
 * entries are then injected and the same windows are replayed (no emitTuples()),
 * which must clear the injected entries.
 *
 * @throws Exception on any file-system or operator failure
 */
@Test
public void testStateWithIdempotency() throws Exception {
    // Start from a clean base directory.
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    // Create 3 files with 2 lines each, named "f<file>l<line>".
    HashSet<String> allLines = Sets.newHashSet();
    for (int file = 0; file < 3; file++) {
        HashSet<String> lines = Sets.newHashSet();
        for (int line = 0; line < 2; line++) {
            lines.add("f" + file + "l" + line);
        }
        allLines.addAll(lines);
        FileUtils.write(new File(testMeta.dir, "file" + file), StringUtils.join(lines, '\n'));
    }
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    // Window state for idempotent replay is persisted under <dir>/recovery.
    FSWindowDataManager manager = new FSWindowDataManager();
    manager.setStatePath(testMeta.dir + "/recovery");
    oper.setWindowDataManager(manager);
    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({ "unchecked", "rawtypes" }) CollectorTestSink<Object> sink = (CollectorTestSink) queryResults;
    oper.output.setSink(sink);
    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");
    oper.setup(testMeta.context);
    // First run: read the files normally; the manager records each window's work.
    for (long wid = 0; wid < 4; wid++) {
        oper.beginWindow(wid);
        oper.emitTuples();
        oper.endWindow();
    }
    oper.teardown();
    sink.clear();
    // Idempotency part: inject stale entries for all three files (as absolute
    // paths, matching how the operator stores them), then replay the recorded
    // windows. beginWindow() performs the replay — emitTuples() is deliberately
    // not called — and the replayed state must remove the injected entries.
    String file0 = new File(testMeta.dir, "file0").getAbsolutePath();
    String file1 = new File(testMeta.dir, "file1").getAbsolutePath();
    String file2 = new File(testMeta.dir, "file2").getAbsolutePath();
    oper.pendingFiles.add(file0);
    oper.failedFiles.add(new AbstractFileInputOperator.FailedFile(file1, 0));
    oper.unfinishedFiles.add(new AbstractFileInputOperator.FailedFile(file2, 0));
    oper.setup(testMeta.context);
    for (long wid = 0; wid < 4; wid++) {
        oper.beginWindow(wid);
        oper.endWindow();
    }
    // BUG FIX: the original assertions compared against the bare names
    // "file0"/"file1"/"file2", but the collections hold absolute paths (see the
    // injection above), so those assertions could never fail. Compare against
    // the same absolute paths that were injected.
    Assert.assertFalse("pending state", oper.pendingFiles.contains(file0));
    for (AbstractFileInputOperator.FailedFile failedFile : oper.failedFiles) {
        Assert.assertFalse("failed state", failedFile.path.equals(file1));
    }
    for (AbstractFileInputOperator.FailedFile unfinishedFile : oper.unfinishedFiles) {
        Assert.assertFalse("unfinished state", unfinishedFile.path.equals(file2));
    }
    oper.teardown();
}
Also used : Path(org.apache.hadoop.fs.Path) FSWindowDataManager(org.apache.apex.malhar.lib.wal.FSWindowDataManager) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink) Test(org.junit.Test)

Example 17 with LineByLineFileInputOperator

Use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in the project apex-malhar by apache.

This example is from the class AbstractFileInputOperatorTest, method checkSubDir.

/**
 * Checks that the scanner's recursive flag controls whether files inside nested
 * sub-directories are read. Two files are written at <dir>/depth_0/file0 and
 * <dir>/depth_0/depth_1/file1; with recursion on, all four lines must be emitted,
 * with recursion off, nothing is emitted.
 *
 * @param recursive value passed to the scanner's setRecursive()
 * @throws Exception on any file-system or operator failure
 */
private void checkSubDir(boolean recursive) throws Exception {
    // Start from a clean base directory.
    FileContext.getLocalFSFileContext().delete(new Path(new File(testMeta.dir).getAbsolutePath()), true);
    HashSet<String> allLines = Sets.newHashSet();
    // Build the nested layout, one directory level per file, two lines per file.
    StringBuilder nested = new StringBuilder();
    for (int fileIndex = 0; fileIndex < 2; fileIndex++) {
        nested.append(String.format("/depth_%d", fileIndex));
        HashSet<String> fileLines = Sets.newHashSet();
        for (int lineIndex = 0; lineIndex < 2; lineIndex++) {
            fileLines.add("f" + fileIndex + "l" + lineIndex);
        }
        allLines.addAll(fileLines);
        FileUtils.write(new File(testMeta.dir + nested.toString(), "file" + fileIndex), StringUtils.join(fileLines, '\n'));
    }
    LineByLineFileInputOperator reader = new LineByLineFileInputOperator();
    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({ "unchecked", "rawtypes" }) CollectorTestSink<Object> sink = (CollectorTestSink) queryResults;
    reader.output.setSink(sink);
    reader.setDirectory(testMeta.dir);
    // Match file0/file1 anywhere except under a "target" path component.
    reader.getScanner().setFilePatternRegexp("((?!target).)*file[\\d]");
    reader.getScanner().setRecursive(recursive);
    reader.setup(testMeta.context);
    long windowId = 0;
    while (windowId < 3) {
        reader.beginWindow(windowId);
        reader.emitTuples();
        reader.endWindow();
        windowId++;
    }
    reader.teardown();
    // Without recursion every file sits in a sub-directory, so nothing is read.
    int expectedNumTuples = recursive ? 4 : 0;
    HashSet<String> expectedLines = recursive ? allLines : new HashSet<String>();
    Assert.assertEquals("number tuples", expectedNumTuples, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", expectedLines, new HashSet<String>(queryResults.collectedTuples));
}
Also used : Path(org.apache.hadoop.fs.Path) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink)

Aggregations

LineByLineFileInputOperator (org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator)17 File (java.io.File)15 Path (org.apache.hadoop.fs.Path)15 Test (org.junit.Test)14 CollectorTestSink (org.apache.apex.malhar.lib.testbench.CollectorTestSink)12 DefaultPartition (com.datatorrent.api.DefaultPartition)6 Partition (com.datatorrent.api.Partitioner.Partition)6 PartitioningContextImpl (org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl)6 FSWindowDataManager (org.apache.apex.malhar.lib.wal.FSWindowDataManager)5 StatsListener (com.datatorrent.api.StatsListener)3 Kryo (com.esotericsoftware.kryo.Kryo)3 Configuration (org.apache.hadoop.conf.Configuration)2 Random (java.util.Random)1