Use of com.twitter.elephantbird.mapreduce.io.RawBlockWriter in the elephant-bird project by Twitter.
Class TestErrorsInInput, method TestErrorTolerance.
/**
 * Exercises the configurable bad-record tolerance when loading with {@code ThriftPigLoader}.
 *
 * <p>Writes 100 serialized copies of the last entry of {@code records} into a single LZO
 * block file, replacing one record in every ten with a copy truncated to three quarters of
 * its length. The input is then loaded three times:
 * <ol>
 *   <li>without changing any setting, where an {@link IOException} is expected;</li>
 *   <li>with {@code BAD_RECORD_THRESHOLD_CONF_KEY} set to {@code "0.5"}, where the 90 intact
 *       rows are expected;</li>
 *   <li>with the threshold set to {@code "0.0001"} and {@code BAD_RECORD_MIN_COUNT_CONF_KEY}
 *       set to one more than the number of corrupt records, where the 90 intact rows are
 *       expected again.</li>
 * </ol>
 *
 * <p>The test is skipped when {@code pigServer} is {@code null}.
 *
 * @throws Exception if writing the input or running a query fails unexpectedly
 */
@Test
public void TestErrorTolerance() throws Exception {
  // test configurable error tolerance in EB record reader.
  Assume.assumeTrue(pigServer != null);

  // initialize
  String testDir = System.getProperty("test.build.data") + "/TestErrorTolerance";
  final File inDir = new File(testDir, "in");
  inDir.mkdirs();

  try {
    TestPerson person = records[records.length - 1];
    String expectedStr = personToString(person);
    byte[] properRec = tConverter.toBytes(person);
    byte[] truncatedRec = Arrays.copyOfRange(properRec, 0, properRec.length * 3 / 4);

    final int totalRecords = 100;
    final int pctErrors = 10;
    final int totalErrors = totalRecords * pctErrors / 100;
    final int goodRecords = totalRecords - totalErrors;

    // The position of the corrupt record within each group of ten is picked at random.
    int corruptIdx = new Random().nextInt(10);

    // create input with 100 records with 10% of records with errors.
    RawBlockWriter blk_writer =
        new RawBlockWriter(createLzoOut(new File(inDir, "1-block.lzo"), conf));
    try {
      for (int i = 0; i < totalRecords; i++) {
        blk_writer.write((i % 10 == corruptIdx) ? truncatedRec : properRec);
      }
    } finally {
      // Close even when a write fails so the underlying stream is not leaked.
      blk_writer.close();
    }

    // Every intact record is the same person, so every expected row is the same string.
    String[] expectedRows = new String[goodRecords];
    Arrays.fill(expectedRows, expectedStr);

    // A = load 'in' using ThriftPigLoader('TestPerson');
    String loadStmt = String.format("A = load '%s' using %s('%s');\n",
        inDir.toURI().toString(),
        ThriftPigLoader.class.getName(),
        TestPerson.class.getName());

    // a simple load should fail.
    pigServer.registerQuery(loadStmt);
    try {
      verifyRows(expectedRows, pigServer.openIterator("A"));
      Assert.fail("A Pig IOException was expected");
    } catch (IOException expected) {
      // expected: the load is attempted here before any tolerance setting is changed.
    }

    // NOTE(review): the properties set below stay on pigServer after this test returns;
    // confirm whether pigServer is shared with other tests.

    // loader should succeed with error rate set to 50%
    pigServer.getPigContext().getProperties()
        .setProperty(LzoRecordReader.BAD_RECORD_THRESHOLD_CONF_KEY, "0.5");
    pigServer.registerQuery(loadStmt);
    verifyRows(expectedRows, pigServer.openIterator("A"));

    // set low threshold and test min_error count works.
    pigServer.getPigContext().getProperties()
        .setProperty(LzoRecordReader.BAD_RECORD_THRESHOLD_CONF_KEY, "0.0001");
    pigServer.getPigContext().getProperties()
        .setProperty(LzoRecordReader.BAD_RECORD_MIN_COUNT_CONF_KEY, "" + (totalErrors + 1));
    verifyRows(expectedRows, pigServer.openIterator("A"));
  } finally {
    // Remove the generated input whether the test passed or failed.
    FileUtil.fullyDelete(new File(testDir));
  }
}
Use of com.twitter.elephantbird.mapreduce.io.RawBlockWriter in the elephant-bird project by Twitter.
Class TestErrorsInInput, method TestMultiFormatLoaderWithEmptyRecords.
/**
 * Loads block-format and base64-line input containing empty records through
 * {@code MultiFormatLoader}.
 *
 * <p>Each entry of {@code records} is written to an LZO block file followed by a zero-length
 * record, and to an LZO base64 line file followed by an empty line. The input directory is
 * then loaded, and the first {@code 2 * records.length} rows are expected to be the entries
 * of {@code records} in order, once per input file.
 *
 * <p>The test is skipped when {@code pigServer} is {@code null}.
 *
 * @throws Exception if writing the input or running the query fails unexpectedly
 */
@Test
public void TestMultiFormatLoaderWithEmptyRecords() throws Exception {
  Assume.assumeTrue(pigServer != null);

  // initialize
  String testDir = System.getProperty("test.build.data") + "/TestEmptyRecords";
  final File inDir = new File(testDir, "in");
  inDir.mkdirs();

  try {
    // block writer
    RawBlockWriter blk_writer =
        new RawBlockWriter(createLzoOut(new File(inDir, "1-block.lzo"), conf));
    try {
      // b64 writer
      OutputStream b64_writer = createLzoOut(new File(inDir, "2-b64.lzo"), conf);
      try {
        Base64 base64 = Codecs.createStandardBase64();
        for (TestPerson rec : records) {
          // write a regular record and an empty record
          byte[] bytes = tConverter.toBytes(rec);
          blk_writer.write(bytes);
          blk_writer.write(new byte[0]);
          b64_writer.write(base64.encode(bytes));
          b64_writer.write(Protobufs.NEWLINE_UTF8_BYTE);
          // empty line.
          b64_writer.write(Protobufs.NEWLINE_UTF8_BYTE);
        }
      } finally {
        // Close even when a write fails so the stream is not leaked.
        b64_writer.close();
      }
    } finally {
      blk_writer.close();
    }
    // end of initialization.

    pigServer.registerQuery(String.format("A = load '%s' using %s('%s');\n",
        inDir.toURI().toString(),
        MultiFormatLoader.class.getName(),
        TestPerson.class.getName()));
    Iterator<Tuple> rows = pigServer.openIterator("A");

    // verify: one pass over the records per input file.
    // NOTE(review): rows remaining after these 2 * records.length are not checked.
    for (int i = 0; i < 2; i++) {
      for (TestPerson person : records) {
        String expected = personToString(person);
        Assert.assertEquals(expected, rows.next().toString());
      }
    }
  } finally {
    // Remove the generated input whether the test passed or failed.
    FileUtil.fullyDelete(new File(testDir));
  }
}
Aggregations