Examples with LineByLineFileInputOperator - org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator

Example 11 with LineByLineFileInputOperator

use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testRecoveryWithPendingFile.

/**
 * Checks that a file already queued in {@code pendingFiles} is read during the first window,
 * even though the operator's scanner has been set to {@code null}.
 *
 * <p>Five lines are written to {@code file0}; after one window the sink must hold exactly those
 * five lines, in the same order in which they were written to the file.
 *
 * @throws Exception if preparing the test directory or running the operator fails
 */
@Test
public void testRecoveryWithPendingFile() throws Exception {
    // Remove whatever is currently stored under the test directory.
    Path testDir = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(testDir, true);

    HashSet<String> lines = Sets.newHashSet();
    for (int idx = 0; idx < 5; idx++) {
        lines.add("f0l" + idx);
    }
    // The expected list and the file content are both derived from the same set iteration,
    // so they share one ordering.
    List<String> expected = Lists.newArrayList();
    expected.addAll(lines);

    File pendingFile = new File(testMeta.dir, "file0");
    String content = StringUtils.join(lines, '\n');
    FileUtils.write(pendingFile, content);

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    // No scanner: the only input the operator knows about is the pending file below.
    oper.scanner = null;
    oper.pendingFiles.add(pendingFile.getAbsolutePath());

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    @SuppressWarnings({ "unchecked", "rawtypes" })
    CollectorTestSink<Object> rawSink = (CollectorTestSink) queryResults;
    oper.output.setSink(rawSink);

    oper.setDirectory(testMeta.dir);
    oper.setup(testMeta.context);

    // Run a single window with one emitTuples() call.
    oper.beginWindow(0);
    oper.emitTuples();
    oper.endWindow();
    oper.teardown();

    Assert.assertEquals("number tuples", 5, queryResults.collectedTuples.size());
    List<String> received = new ArrayList<String>(queryResults.collectedTuples);
    Assert.assertEquals("lines", expected, received);
}

Also used : Path(org.apache.hadoop.fs.Path) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink) Test(org.junit.Test)

Example 12 with LineByLineFileInputOperator

use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testIdempotencyWithMultipleEmitTuples.

/**
 * Runs window 0 with three {@code emitTuples()} calls over two two-line files, then sets the
 * same operator up again and replays window 0 without calling {@code emitTuples()}.
 *
 * <p>The replayed window must deliver the same four tuples as the first run.
 *
 * @throws Exception if preparing the test directory or running the operator fails
 */
@Test
public void testIdempotencyWithMultipleEmitTuples() throws Exception {
    // Remove whatever is currently stored under the test directory.
    Path testDir = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(testDir, true);

    // Create file0 and file1, each holding two lines of the form f<file>l<line>.
    List<String> allLines = Lists.newArrayList();
    for (int fileIdx = 0; fileIdx < 2; fileIdx++) {
        List<String> fileLines = Lists.newArrayList();
        for (int lineIdx = 0; lineIdx < 2; lineIdx++) {
            fileLines.add("f" + fileIdx + "l" + lineIdx);
        }
        allLines.addAll(fileLines);
        File target = new File(testMeta.dir, "file" + fileIdx);
        FileUtils.write(target, StringUtils.join(fileLines, '\n'));
    }

    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();

    // Window state is kept under <test dir>/recovery.
    FSWindowDataManager windowDataManager = new FSWindowDataManager();
    windowDataManager.setStatePath(testMeta.dir + "/recovery");
    oper.setWindowDataManager(windowDataManager);

    CollectorTestSink<String> queryResults = new CollectorTestSink<String>();
    TestUtils.setSink(oper.output, queryResults);

    oper.setDirectory(testMeta.dir);
    oper.getScanner().setFilePatternRegexp(".*file[\\d]");

    // First run: one window, three emitTuples() calls.
    oper.setup(testMeta.context);
    oper.beginWindow(0);
    for (int call = 0; call < 3; call++) {
        oper.emitTuples();
    }
    oper.endWindow();
    oper.teardown();

    List<String> beforeRecovery = Lists.newArrayList(queryResults.collectedTuples);
    queryResults.clear();

    // Idempotency part: replay window 0 after a fresh setup, with no emitTuples() call.
    oper.setup(testMeta.context);
    oper.beginWindow(0);
    oper.endWindow();

    Assert.assertEquals("number tuples", 4, queryResults.collectedTuples.size());
    Assert.assertEquals("lines", beforeRecovery, queryResults.collectedTuples);
    oper.teardown();
}

Also used : Path(org.apache.hadoop.fs.Path) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) FSWindowDataManager(org.apache.apex.malhar.lib.wal.FSWindowDataManager) CollectorTestSink(org.apache.apex.malhar.lib.testbench.CollectorTestSink) Test(org.junit.Test)

Example 13 with LineByLineFileInputOperator

use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testWithCustomScanner.

/**
 * Partitions an operator that uses {@code MyScanner} into two partitions and scans the test
 * directory with each partition's scanner.
 *
 * <p>Ten empty files are created, named {@code <index>_partition_00<random>} with the index
 * running from 0 to 9 (inclusive). Every partitioned instance must be a different object from
 * the original operator and must carry its own scanner.
 *
 * <p>NOTE(review): {@code MyScanner} is defined elsewhere; presumably it derives the partition
 * from the leading file index - confirm. An earlier version of this comment said that each
 * partition should read 5 files, while the assertion below expects 6 entries per scan - confirm
 * which is intended.
 *
 * @throws Exception if preparing the test directory or scanning fails
 */
@Test
public void testWithCustomScanner() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.setScanner(new MyScanner());
    oper.getScanner().setFilePatternRegexp(".*partition_([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());

    Random random = new Random();
    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);

    // Ten empty files; only the leading index is deterministic, the suffix is random.
    int fileIdx = 0;
    while (fileIdx < 10) {
        String fileName = fileIdx + "_partition_00" + random.nextInt(100);
        FileUtils.write(new File(testMeta.dir, fileName), "");
        fileIdx++;
    }

    List<Partition<AbstractFileInputOperator<String>>> initial = Lists.newArrayList();
    initial.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions =
        oper.definePartitions(initial, new PartitioningContextImpl(null, 2));

    Assert.assertEquals(2, newPartitions.size());
    // partitioned() wasn't called
    Assert.assertEquals(1, oper.getCurrentPartitions());

    for (Partition<AbstractFileInputOperator<String>> partition : newPartitions) {
        AbstractFileInputOperator<String> instance = partition.getPartitionedInstance();
        Assert.assertNotSame(oper, instance);
        Assert.assertNotSame(oper.getScanner(), instance.getScanner());

        // Nothing is marked as consumed, so the scan is limited only by the scanner itself.
        Set<String> consumed = Sets.newHashSet();
        FileSystem localFs = FileSystem.getLocal(new Configuration(false));
        LinkedHashSet<Path> files = instance.getScanner().scan(localFs, path, consumed);
        Assert.assertEquals("partition " + files, 6, files.size());
    }
}

Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) Configuration(org.apache.hadoop.conf.Configuration) Random(java.util.Random) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Example 14 with LineByLineFileInputOperator

use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testPartitioning.

/**
 * Partitions an operator with the default scanner into two partitions and scans the test
 * directory with each partition's scanner.
 *
 * <p>Four empty files named {@code partition000} to {@code partition003} are created. Every
 * partitioned instance must be a different object from the original operator and must carry
 * its own scanner.
 *
 * <p>NOTE(review): four files over two partitions, yet each scan is expected to return 3
 * entries - presumably the scan result contains something besides the evenly split files;
 * confirm against the scanner implementation.
 *
 * @throws Exception if preparing the test directory or scanning fails
 */
@Test
public void testPartitioning() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());

    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);

    int fileIdx = 0;
    while (fileIdx < 4) {
        FileUtils.write(new File(testMeta.dir, "partition00" + fileIdx), "");
        fileIdx++;
    }

    List<Partition<AbstractFileInputOperator<String>>> initial = Lists.newArrayList();
    initial.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions =
        oper.definePartitions(initial, new PartitioningContextImpl(null, 2));

    Assert.assertEquals(2, newPartitions.size());
    // partitioned() wasn't called
    Assert.assertEquals(1, oper.getCurrentPartitions());

    for (Partition<AbstractFileInputOperator<String>> partition : newPartitions) {
        AbstractFileInputOperator<String> instance = partition.getPartitionedInstance();
        Assert.assertNotSame(oper, instance);
        Assert.assertNotSame(oper.getScanner(), instance.getScanner());

        // Nothing is marked as consumed, so the scan is limited only by the scanner itself.
        Set<String> consumed = Sets.newHashSet();
        FileSystem localFs = FileSystem.getLocal(new Configuration(false));
        LinkedHashSet<Path> files = instance.getScanner().scan(localFs, path, consumed);
        Assert.assertEquals("partition " + files, 3, files.size());
    }
}

Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) Configuration(org.apache.hadoop.conf.Configuration) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Example 15 with LineByLineFileInputOperator

use of org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator in project apex-malhar by apache.

the class AbstractFileInputOperatorTest method testWindowDataManagerPartitioning.

/**
 * Partitions an operator that has an {@code FSWindowDataManager} and operator id 7 into two
 * partitions, then inspects the window data managers of the resulting partitions.
 *
 * <p>Exactly one of the two managers must report a non-null set of deleted operators, and that
 * set must contain only the id 7.
 *
 * @throws Exception if preparing the test directory or partitioning fails
 */
@Test
public void testWindowDataManagerPartitioning() throws Exception {
    LineByLineFileInputOperator oper = new LineByLineFileInputOperator();
    oper.getScanner().setFilePatternRegexp(".*partition([\\d]*)");
    oper.setDirectory(new File(testMeta.dir).getAbsolutePath());
    oper.setWindowDataManager(new FSWindowDataManager());
    // This id is the one expected in the deleted-operators set at the end.
    oper.operatorId = 7;

    Path path = new Path(new File(testMeta.dir).getAbsolutePath());
    FileContext.getLocalFSFileContext().delete(path, true);

    int fileIdx = 0;
    while (fileIdx < 4) {
        FileUtils.write(new File(testMeta.dir, "partition00" + fileIdx), "");
        fileIdx++;
    }

    List<Partition<AbstractFileInputOperator<String>>> initial = Lists.newArrayList();
    initial.add(new DefaultPartition<AbstractFileInputOperator<String>>(oper));
    Collection<Partition<AbstractFileInputOperator<String>>> newPartitions =
        oper.definePartitions(initial, new PartitioningContextImpl(null, 2));

    Assert.assertEquals(2, newPartitions.size());
    Assert.assertEquals(1, oper.getCurrentPartitions());

    // Gather the window data manager of every new partition.
    List<FSWindowDataManager> managers = Lists.newLinkedList();
    for (Partition<AbstractFileInputOperator<String>> partition : newPartitions) {
        AbstractFileInputOperator<String> instance = partition.getPartitionedInstance();
        managers.add((FSWindowDataManager) instance.getWindowDataManager());
    }
    Assert.assertEquals("count of storage managers", 2, managers.size());

    // Find the managers that carry a deleted-operators set; the last one found is kept.
    FSWindowDataManager deleteManager = null;
    int countOfDeleteManagers = 0;
    for (FSWindowDataManager candidate : managers) {
        if (candidate.getDeletedOperators() == null) {
            continue;
        }
        deleteManager = candidate;
        countOfDeleteManagers++;
    }

    Assert.assertEquals("count of delete managers", 1, countOfDeleteManagers);
    Assert.assertNotNull("deleted operators manager", deleteManager);
    Assert.assertEquals("deleted operators", Sets.newHashSet(7), deleteManager.getDeletedOperators());
}

Also used : Path(org.apache.hadoop.fs.Path) Partition(com.datatorrent.api.Partitioner.Partition) DefaultPartition(com.datatorrent.api.DefaultPartition) FSWindowDataManager(org.apache.apex.malhar.lib.wal.FSWindowDataManager) PartitioningContextImpl(org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl) LineByLineFileInputOperator(org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator) File(java.io.File) Test(org.junit.Test)

Aggregations

LineByLineFileInputOperator (org.apache.apex.malhar.lib.fs.LineByLineFileInputOperator): 17, File (java.io.File): 15, Path (org.apache.hadoop.fs.Path): 15, Test (org.junit.Test): 14, CollectorTestSink (org.apache.apex.malhar.lib.testbench.CollectorTestSink): 12, DefaultPartition (com.datatorrent.api.DefaultPartition): 6, Partition (com.datatorrent.api.Partitioner.Partition): 6, PartitioningContextImpl (org.apache.apex.malhar.lib.partitioner.StatelessPartitionerTest.PartitioningContextImpl): 6, FSWindowDataManager (org.apache.apex.malhar.lib.wal.FSWindowDataManager): 5, StatsListener (com.datatorrent.api.StatsListener): 3, Kryo (com.esotericsoftware.kryo.Kryo): 3, Configuration (org.apache.hadoop.conf.Configuration): 2, Random (java.util.Random): 1