Search in sources :

Example 1 with RawBlockWriter

use of com.twitter.elephantbird.mapreduce.io.RawBlockWriter in project elephant-bird by twitter.

From the class TestErrorsInInput, method TestErrorTolerance.

/**
 * Exercises the configurable bad-record tolerance of the LZO record reader through a Pig load.
 *
 * <p>Writes 100 block-format records to an LZO file, of which every tenth (at a randomly chosen
 * offset within each group of ten) is truncated to three quarters of its serialized length. The
 * test then checks three things in order: a load with default settings throws an
 * {@link IOException}; a load with the bad-record threshold set to 0.5 yields the 90 intact rows;
 * and a load with a very low threshold but a minimum error count of {@code totalErrors + 1} also
 * yields the 90 intact rows.
 *
 * <p>The test is skipped when no {@code pigServer} is available.
 *
 * @throws Exception if writing the input or running a Pig query fails unexpectedly
 */
@Test
public void TestErrorTolerance() throws Exception {
    // test configurable error tolerance in EB record reader.
    Assume.assumeTrue(pigServer != null);
    // initialize
    String testDir = System.getProperty("test.build.data") + "/TestErrorTolerance";
    final File inDir = new File(testDir, "in");
    inDir.mkdirs();
    // create input with 100 records with 10% of records with errors.
    TestPerson person = records[records.length - 1];
    String expectedStr = personToString(person);
    byte[] properRec = tConverter.toBytes(person);
    // A corrupt record is simply the first 3/4 of a valid serialized record.
    byte[] truncatedRec = Arrays.copyOfRange(properRec, 0, properRec.length * 3 / 4);
    final int totalRecords = 100;
    final int pctErrors = 10;
    final int totalErrors = totalRecords * pctErrors / 100;
    final int goodRecords = totalRecords - totalErrors;
    // Randomize which slot in each group of ten is corrupt; the total stays at 10 bad records.
    int corruptIdx = new Random().nextInt(10);
    RawBlockWriter blk_writer = new RawBlockWriter(createLzoOut(new File(inDir, "1-block.lzo"), conf));
    try {
        for (int i = 0; i < totalRecords; i++) {
            blk_writer.write((i % 10 == corruptIdx) ? truncatedRec : properRec);
        }
    } finally {
        // Close even when a write fails so the underlying stream is not leaked.
        blk_writer.close();
    }
    // Every intact record is the same person, so all expected rows are identical.
    String[] expectedRows = new String[goodRecords];
    Arrays.fill(expectedRows, expectedStr);
    // A = load 'in' using ThriftPigLoader('TestPerson');
    String loadStmt = String.format("A = load '%s' using %s('%s');\n", inDir.toURI().toString(), ThriftPigLoader.class.getName(), TestPerson.class.getName());
    // a simple load should fail.
    pigServer.registerQuery(loadStmt);
    try {
        verifyRows(expectedRows, pigServer.openIterator("A"));
        Assert.fail("A Pig IOException was expected");
    } catch (IOException expected) {
        // expected: with default settings the corrupt records must abort the load.
    }
    // loader should succeed with error rate set to 50%
    pigServer.getPigContext().getProperties().setProperty(LzoRecordReader.BAD_RECORD_THRESHOLD_CONF_KEY, "0.5");
    pigServer.registerQuery(loadStmt);
    verifyRows(expectedRows, pigServer.openIterator("A"));
    // set low threshold and test min_error count works.
    pigServer.getPigContext().getProperties().setProperty(LzoRecordReader.BAD_RECORD_THRESHOLD_CONF_KEY, "0.0001");
    pigServer.getPigContext().getProperties().setProperty(LzoRecordReader.BAD_RECORD_MIN_COUNT_CONF_KEY, "" + (totalErrors + 1));
    verifyRows(expectedRows, pigServer.openIterator("A"));
}
Also used : Random(java.util.Random) RawBlockWriter(com.twitter.elephantbird.mapreduce.io.RawBlockWriter) IOException(java.io.IOException) TestPerson(com.twitter.elephantbird.thrift.test.TestPerson) File(java.io.File) Test(org.junit.Test)

Example 2 with RawBlockWriter

use of com.twitter.elephantbird.mapreduce.io.RawBlockWriter in project elephant-bird by twitter.

From the class TestErrorsInInput, method TestMultiFormatLoaderWithEmptyRecords.

/**
 * Checks that {@code MultiFormatLoader} reads both block-format and base64-line-format LZO
 * files that have empty records interleaved with the real ones.
 *
 * <p>For every entry in {@code records} the test writes the serialized record followed by an
 * empty record to a block file, and the base64-encoded record followed by an empty line to a
 * line file. It then loads the directory through Pig and expects the rows to match
 * {@code records} in order, twice over (once per input file).
 *
 * <p>The test is skipped when no {@code pigServer} is available. The test directory is removed
 * afterwards whether or not the assertions pass.
 *
 * @throws Exception if writing the input or running the Pig query fails
 */
@Test
public void TestMultiFormatLoaderWithEmptyRecords() throws Exception {
    Assume.assumeTrue(pigServer != null);
    // initialize
    String testDir = System.getProperty("test.build.data") + "/TestEmptyRecords";
    final File inDir = new File(testDir, "in");
    inDir.mkdirs();
    try {
        // block writer
        RawBlockWriter blk_writer = new RawBlockWriter(createLzoOut(new File(inDir, "1-block.lzo"), conf));
        try {
            // b64 writer
            OutputStream b64_writer = createLzoOut(new File(inDir, "2-b64.lzo"), conf);
            try {
                Base64 base64 = Codecs.createStandardBase64();
                for (TestPerson rec : records) {
                    // write a regular record and an empty record
                    byte[] bytes = tConverter.toBytes(rec);
                    blk_writer.write(bytes);
                    blk_writer.write(new byte[0]);
                    b64_writer.write(base64.encode(bytes));
                    b64_writer.write(Protobufs.NEWLINE_UTF8_BYTE);
                    // empty line.
                    b64_writer.write(Protobufs.NEWLINE_UTF8_BYTE);
                }
            } finally {
                // Close even when a write fails so the stream is not leaked.
                b64_writer.close();
            }
        } finally {
            blk_writer.close();
        }
        // end of initialization.
        pigServer.registerQuery(String.format("A = load '%s' using %s('%s');\n", inDir.toURI().toString(), MultiFormatLoader.class.getName(), TestPerson.class.getName()));
        Iterator<Tuple> rows = pigServer.openIterator("A");
        // verify: the full record list is expected once for each of the two input files.
        for (int i = 0; i < 2; i++) {
            for (TestPerson person : records) {
                String expected = personToString(person);
                Assert.assertEquals(expected, rows.next().toString());
            }
        }
    } finally {
        // Clean up on failure too, so a failed run does not leave stale input behind.
        FileUtil.fullyDelete(new File(testDir));
    }
}
Also used : Base64(org.apache.commons.codec.binary.Base64) RawBlockWriter(com.twitter.elephantbird.mapreduce.io.RawBlockWriter) DataOutputStream(java.io.DataOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) TestPerson(com.twitter.elephantbird.thrift.test.TestPerson) File(java.io.File) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Aggregations

RawBlockWriter (com.twitter.elephantbird.mapreduce.io.RawBlockWriter): 2, TestPerson (com.twitter.elephantbird.thrift.test.TestPerson): 2, File (java.io.File): 2, Test (org.junit.Test): 2, DataOutputStream (java.io.DataOutputStream): 1, FileOutputStream (java.io.FileOutputStream): 1, IOException (java.io.IOException): 1, OutputStream (java.io.OutputStream): 1, Random (java.util.Random): 1, Base64 (org.apache.commons.codec.binary.Base64): 1, Tuple (org.apache.pig.data.Tuple): 1