Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.
The class RCFileGenerator, method genData.
private static void genData(String format, int numRows, String output, String plainOutput) throws Exception {
  int numFields = 0;
  if (format.equals("student")) {
    rand = new Random(numRows);
    numFields = 3;
  } else if (format.equals("voter")) {
    rand = new Random(1000000000 + numRows);
    numFields = 4;
  } else if (format.equals("alltypes")) {
    rand = new Random(2000000000L + numRows);
    numFields = 10;
  }
  RCFileOutputFormat.setColumnNumber(conf, numFields);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, getFile(output), null, new DefaultCodec());
  PrintWriter pw = new PrintWriter(new FileWriter(plainOutput));
  for (int j = 0; j < numRows; j++) {
    BytesRefArrayWritable row = new BytesRefArrayWritable(numFields);
    byte[][] fields = null;
    if (format.equals("student")) {
      byte[][] f = {
          randomName().getBytes("UTF-8"),
          Integer.valueOf(randomAge()).toString().getBytes("UTF-8"),
          Double.valueOf(randomGpa()).toString().getBytes("UTF-8") };
      fields = f;
    } else if (format.equals("voter")) {
      byte[][] f = {
          randomName().getBytes("UTF-8"),
          Integer.valueOf(randomAge()).toString().getBytes("UTF-8"),
          randomRegistration().getBytes("UTF-8"),
          Double.valueOf(randomContribution()).toString().getBytes("UTF-8") };
      fields = f;
    } else if (format.equals("alltypes")) {
      byte[][] f = {
          Integer.valueOf(rand.nextInt(Byte.MAX_VALUE)).toString().getBytes("UTF-8"),
          Integer.valueOf(rand.nextInt(Short.MAX_VALUE)).toString().getBytes("UTF-8"),
          Integer.valueOf(rand.nextInt()).toString().getBytes("UTF-8"),
          Long.valueOf(rand.nextLong()).toString().getBytes("UTF-8"),
          Float.valueOf(rand.nextFloat() * 1000).toString().getBytes("UTF-8"),
          Double.valueOf(rand.nextDouble() * 1000000).toString().getBytes("UTF-8"),
          randomName().getBytes("UTF-8"),
          randomMap(),
          randomArray() };
      fields = f;
    }
    for (int i = 0; i < fields.length; i++) {
      BytesRefWritable field = new BytesRefWritable(fields[i], 0, fields[i].length);
      row.set(i, field);
      pw.print(new String(fields[i]));
      if (i != fields.length - 1)
        pw.print("\t");
      else
        pw.println();
    }
    writer.append(row);
  }
  writer.close();
  pw.close();
}
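For orientation, DefaultCodec can also be exercised on its own, outside RCFile.Writer. The following is a minimal, self-contained sketch, not taken from the Hive sources above (the class name and sample bytes are illustrative), that round-trips a buffer through the codec's compression and decompression streams:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.DefaultCodec;

public class DefaultCodecRoundTrip {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    DefaultCodec codec = new DefaultCodec();
    // DefaultCodec is Configurable and expects a Configuration before use.
    codec.setConf(conf);

    byte[] original = "hive and hadoop".getBytes(StandardCharsets.UTF_8);

    // Compress into an in-memory buffer.
    ByteArrayOutputStream compressed = new ByteArrayOutputStream();
    CompressionOutputStream cos = codec.createOutputStream(compressed);
    cos.write(original);
    cos.finish();
    cos.close();

    // Decompress and print the recovered text.
    ByteArrayOutputStream restored = new ByteArrayOutputStream();
    CompressionInputStream cis =
        codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()));
    IOUtils.copyBytes(cis, restored, conf, true);
    System.out.println(new String(restored.toByteArray(), StandardCharsets.UTF_8));
  }
}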
Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.
The class TestRCFileCat, method testRCFileCat.
/**
* Tests parsing an RCFile with RCFileCat.
*/
@Test
public void testRCFileCat() throws Exception {
  File template = File.createTempFile("hive", "tmpTest");
  Configuration configuration = new Configuration();
  byte[][] record_1 = { Bytes.toBytes("123"), Bytes.toBytes("456"), Bytes.toBytes("789"),
      Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"),
      new byte[0], Bytes.toBytes("NULL") };
  byte[][] record_2 = { Bytes.toBytes("100"), Bytes.toBytes("200"), Bytes.toBytes("123"),
      Bytes.toBytes("1000"), Bytes.toBytes("5.3"), Bytes.toBytes("hive and hadoop"),
      new byte[0], Bytes.toBytes("NULL") };
  byte[][] record_3 = { Bytes.toBytes("200"), Bytes.toBytes("400"), Bytes.toBytes("678"),
      Bytes.toBytes("1000"), Bytes.toBytes("4.8"), Bytes.toBytes("hive and hadoop"),
      new byte[0], Bytes.toBytes("TEST") };
  RCFileOutputFormat.setColumnNumber(configuration, 8);
  Path file = new Path(template.getAbsolutePath());
  FileSystem fs = FileSystem.getLocal(configuration);
  RCFile.Writer writer = new RCFile.Writer(fs, configuration, file, null,
      RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")),
      new DefaultCodec());
  write(writer, record_1);
  write(writer, record_2);
  write(writer, record_3);
  writer.close();
  RCFileCat fileCat = new RCFileCat();
  fileCat.test = true;
  fileCat.setConf(new Configuration());
  // swap in fake output and error streams
  PrintStream oldOutPrintStream = System.out;
  PrintStream oldErrPrintStream = System.err;
  ByteArrayOutputStream dataOut = new ByteArrayOutputStream();
  ByteArrayOutputStream dataErr = new ByteArrayOutputStream();
  System.setOut(new PrintStream(dataOut));
  System.setErr(new PrintStream(dataErr));
  try {
    String[] params = { "--verbose", "file://" + template.toURI().getPath() };
    assertEquals(0, fileCat.run(params));
    assertTrue(dataOut.toString().contains("123\t456\t789\t1000\t5.3\thive and hadoop\t\tNULL"));
    assertTrue(dataOut.toString().contains("100\t200\t123\t1000\t5.3\thive and hadoop\t\tNULL"));
    assertTrue(dataOut.toString().contains("200\t400\t678\t1000\t4.8\thive and hadoop\t\tTEST"));
    dataOut.reset();
    params = new String[] { "--start=-10", "--file-sizes", "file://" + template.toURI().getPath() };
    assertEquals(0, fileCat.run(params));
    assertTrue(dataOut.toString().contains(
        "File size (uncompressed): 105. File size (compressed): 134. Number of rows: 3."));
    dataOut.reset();
    params = new String[] { "--start=0", "--column-sizes", "file://" + template.toURI().getPath() };
    assertEquals(0, fileCat.run(params));
    assertTrue(dataOut.toString().contains("0\t9\t17"));
    assertTrue(dataOut.toString().contains("1\t9\t17"));
    assertTrue(dataOut.toString().contains("2\t9\t17"));
    assertTrue(dataOut.toString().contains("3\t12\t14"));
    assertTrue(dataOut.toString().contains("4\t9\t17"));
    assertTrue(dataOut.toString().contains("5\t45\t26"));
    dataOut.reset();
    params = new String[] { "--start=0", "--column-sizes-pretty", "file://" + template.toURI().getPath() };
    assertEquals(0, fileCat.run(params));
    assertTrue(dataOut.toString().contains("Column 0: Uncompressed size: 9 Compressed size: 17"));
    assertTrue(dataOut.toString().contains("Column 1: Uncompressed size: 9 Compressed size: 17"));
    assertTrue(dataOut.toString().contains("Column 2: Uncompressed size: 9 Compressed size: 17"));
    assertTrue(dataOut.toString().contains("Column 3: Uncompressed size: 12 Compressed size: 14"));
    assertTrue(dataOut.toString().contains("Column 4: Uncompressed size: 9 Compressed size: 17"));
    assertTrue(dataOut.toString().contains("Column 5: Uncompressed size: 45 Compressed size: 26"));
    params = new String[] {};
    assertEquals(-1, fileCat.run(params));
    assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] "
        + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
    dataErr.reset();
    params = new String[] { "--fakeParameter", "file://" + template.toURI().getPath() };
    assertEquals(-1, fileCat.run(params));
    assertTrue(dataErr.toString().contains("RCFileCat [--start=start_offet] [--length=len] [--verbose] "
        + "[--column-sizes | --column-sizes-pretty] [--file-sizes] fileName"));
  } finally {
    // restore the original output and error streams
    System.setOut(oldOutPrintStream);
    System.setErr(oldErrPrintStream);
  }
}
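The test drives RCFileCat through setConf and run, i.e. the Hadoop Tool contract, so the same class can be invoked outside JUnit via ToolRunner. A minimal sketch, with the assumption that RCFileCat lives in the Hive CLI package org.apache.hadoop.hive.cli and with a placeholder file path:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.cli.RCFileCat;  // package assumed: Hive CLI module
import org.apache.hadoop.util.ToolRunner;

public class RCFileCatDriver {
  public static void main(String[] args) throws Exception {
    // "/tmp/example.rc" is a placeholder; point it at any local or HDFS RCFile.
    String[] catArgs = { "--verbose", "file:///tmp/example.rc" };
    int exitCode = ToolRunner.run(new Configuration(), new RCFileCat(), catArgs);
    System.exit(exitCode);
  }
}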
Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.
The class TestRCFile, method testGetColumn.
/**
* Tests the {@link RCFile.Reader#getColumn(int, BytesRefArrayWritable)} method.
* @throws IOException
*/
@Test
public void testGetColumn() throws IOException {
  cleanup();
  RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null,
      RCFile.createMetadata(new Text("apple"), new Text("block"), new Text("cat"), new Text("dog")),
      new DefaultCodec());
  byte[][] record_1 = { "123".getBytes("UTF-8"), "456".getBytes("UTF-8"), "789".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"),
      new byte[0], "NULL".getBytes("UTF-8") };
  byte[][] record_2 = { "100".getBytes("UTF-8"), "200".getBytes("UTF-8"), "123".getBytes("UTF-8"),
      "1000".getBytes("UTF-8"), "5.3".getBytes("UTF-8"), "hive and hadoop".getBytes("UTF-8"),
      new byte[0], "NULL".getBytes("UTF-8") };
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(record_1.length);
  for (int i = 0; i < record_1.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_1[i], 0, record_1[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  bytes.clear();
  for (int i = 0; i < record_2.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(record_2[i], 0, record_2[i].length);
    bytes.set(i, cu);
  }
  writer.append(bytes);
  writer.close();
  RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
  LongWritable rowID = new LongWritable();
  assertTrue(reader.next(rowID));
  assertEquals(rowID.get(), 0L);
  assertTrue(reader.next(rowID));
  assertEquals(rowID.get(), 1L);
  BytesRefArrayWritable result = null;
  BytesRefWritable brw;
  for (int col = 0; col < 8; col++) {
    BytesRefArrayWritable result2 = reader.getColumn(col, result);
    if (result == null) {
      assertNotNull(result2);
      result = result2;
    } else {
      // getColumn(int, BytesRefArrayWritable) should return the instance passed in:
      assertSame(result2, result);
    }
    // each column has a height of 2 rows:
    assertEquals(2, result.size());
    for (int row = 0; row < result.size(); row++) {
      brw = result.get(row);
      int start = brw.getStart();
      int len = brw.getLength();
      byte[] actualData = Arrays.copyOfRange(brw.getData(), start, start + len);
      byte[] expectedData = (row == 0) ? record_1[col] : record_2[col];
      assertArrayEquals("col=" + col + " : row=" + row, expectedData, actualData);
    }
    result.clear();
  }
  reader.close();
}
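Column-wise access via getColumn() has a row-wise counterpart, next() plus getCurrentRow(), which the corruption test below also relies on. A minimal, self-contained sketch (the file path is a placeholder) that prints every row of an RCFile as tab-separated text:

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.RCFile;
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;
import org.apache.hadoop.hive.serde2.columnar.BytesRefWritable;
import org.apache.hadoop.io.LongWritable;

public class RCFileRowScan {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path file = new Path("/tmp/example.rc");  // placeholder path

    RCFile.Reader reader = new RCFile.Reader(fs, file, conf);
    LongWritable rowID = new LongWritable();
    BytesRefArrayWritable row = new BytesRefArrayWritable();
    while (reader.next(rowID)) {
      reader.getCurrentRow(row);
      StringBuilder line = new StringBuilder();
      for (int i = 0; i < row.size(); i++) {
        BytesRefWritable cell = row.get(i);
        byte[] data = Arrays.copyOfRange(cell.getData(), cell.getStart(),
            cell.getStart() + cell.getLength());
        if (i > 0) {
          line.append('\t');
        }
        line.append(new String(data, StandardCharsets.UTF_8));
      }
      System.out.println(line);
    }
    reader.close();
  }
}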
Use of org.apache.hadoop.io.compress.DefaultCodec in project hive by apache.
The class TestRCFile, method testReadCorruptFile.
@Test
public void testReadCorruptFile() throws IOException, SerDeException {
  cleanup();
  byte[][] record = { null, null, null, null, null, null, null, null };
  RCFileOutputFormat.setColumnNumber(conf, expectedFieldsData.length);
  RCFile.Writer writer = new RCFile.Writer(fs, conf, file, null, new DefaultCodec());
  BytesRefArrayWritable bytes = new BytesRefArrayWritable(record.length);
  final int recCount = 100;
  Random rand = new Random();
  for (int recIdx = 0; recIdx < recCount; recIdx++) {
    for (int i = 0; i < record.length; i++) {
      record[i] = Integer.toString(rand.nextInt()).getBytes("UTF-8");
    }
    for (int i = 0; i < record.length; i++) {
      BytesRefWritable cu = new BytesRefWritable(record[i], 0, record[i].length);
      bytes.set(i, cu);
    }
    writer.append(bytes);
    bytes.clear();
  }
  writer.close();
  // Insert junk in the middle of the file. Assumes the file is on local disk.
  RandomAccessFile raf = new RandomAccessFile(file.toUri().getPath(), "rw");
  long corruptOffset = raf.length() / 2;
  LOG.info("corrupting " + raf + " at offset " + corruptOffset);
  raf.seek(corruptOffset);
  raf.writeBytes("junkjunkjunkjunkjunkjunkjunkjunk");
  raf.close();
  // Set the option for tolerating corruptions. The read should succeed.
  Configuration tmpConf = new Configuration(conf);
  tmpConf.setBoolean("hive.io.rcfile.tolerate.corruptions", true);
  RCFile.Reader reader = new RCFile.Reader(fs, file, tmpConf);
  LongWritable rowID = new LongWritable();
  while (true) {
    boolean more = reader.next(rowID);
    if (!more) {
      break;
    }
    BytesRefArrayWritable cols = new BytesRefArrayWritable();
    reader.getCurrentRow(cols);
    cols.resetValid(8);
  }
  reader.close();
}
Use of org.apache.hadoop.io.compress.DefaultCodec in project hadoop by apache.
The class TestSequenceFileAppend, method testAppend.
@Test(timeout = 30000)
public void testAppend() throws Exception {
  Path file = new Path(ROOT_PATH, "testseqappend.seq");
  fs.delete(file, true);
  Text key1 = new Text("Key1");
  Text value1 = new Text("Value1");
  Text value2 = new Text("Updated");
  SequenceFile.Metadata metadata = new SequenceFile.Metadata();
  metadata.set(key1, value1);
  Writer.Option metadataOption = Writer.metadata(metadata);
  Writer writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
      SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
      metadataOption);
  writer.append(1L, "one");
  writer.append(2L, "two");
  writer.close();
  verify2Values(file);
  metadata.set(key1, value2);
  writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
      SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
      SequenceFile.Writer.appendIfExists(true), metadataOption);
  // Verify the metadata is not changed
  assertEquals(value1, writer.metadata.get(key1));
  writer.append(3L, "three");
  writer.append(4L, "four");
  writer.close();
  verifyAll4Values(file);
  // Verify the metadata is readable after append
  Reader reader = new Reader(conf, Reader.file(file));
  assertEquals(value1, reader.getMetadata().get(key1));
  reader.close();
  // Verify failure if the compression details are different
  try {
    Option wrongCompressOption = Writer.compression(CompressionType.RECORD, new GzipCodec());
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
    writer.close();
    fail("Expected IllegalArgumentException for compression options");
  } catch (IllegalArgumentException IAE) {
    // Expected exception. Ignore it.
  }
  try {
    Option wrongCompressOption = Writer.compression(CompressionType.BLOCK, new DefaultCodec());
    writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(Long.class), SequenceFile.Writer.valueClass(String.class),
        SequenceFile.Writer.appendIfExists(true), wrongCompressOption);
    writer.close();
    fail("Expected IllegalArgumentException for compression options");
  } catch (IllegalArgumentException IAE) {
    // Expected exception. Ignore it.
  }
  fs.deleteOnExit(file);
}
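For contrast with the deliberately mismatched compression options above, the following minimal sketch (placeholder path, Writable key and value types chosen so the example is self-contained) writes and reads back a SequenceFile that actually uses DefaultCodec with block compression:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class SequenceFileDefaultCodecExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path file = new Path("/tmp/example.seq");  // placeholder path

    // Write two records, block-compressed with DefaultCodec.
    SequenceFile.Writer writer = SequenceFile.createWriter(conf,
        SequenceFile.Writer.file(file),
        SequenceFile.Writer.keyClass(LongWritable.class),
        SequenceFile.Writer.valueClass(Text.class),
        SequenceFile.Writer.compression(CompressionType.BLOCK, new DefaultCodec()));
    writer.append(new LongWritable(1L), new Text("one"));
    writer.append(new LongWritable(2L), new Text("two"));
    writer.close();

    // Read them back and print key/value pairs.
    SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(file));
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (reader.next(key, value)) {
      System.out.println(key + "\t" + value);
    }
    reader.close();
  }
}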