Use of org.apache.pig.PigServer in the elephant-bird project by Twitter.
Class TestJsonLoader, method testNestedLoad.
@Test
public void testNestedLoad() throws IOException {
    File tempFile = File.createTempFile("json", null);
    tempFile.deleteOnExit();
    // FIX: try-with-resources guarantees the writer is closed (and its buffer
    // flushed) even if write() throws; the original leaked the FileWriter on failure.
    try (FileWriter writer = new FileWriter(tempFile)) {
        // json structure as in Twitter Streaming
        writer.write("{" + " \"entities\": {" + " \"hashtags\": [" + " {\"indices\": [0,0], \"text\": \"test1\"}," + " {\"indices\": [0,0], \"text\": \"test2\"}" + " ]," + " \"user_mentions\": []," + " \"urls\": []" + " }" + "}");
    }
    // extract hashtags from it
    PigServer pigServer = PigTestUtil.makePigServer();
    // enable nested load
    pigServer.getPigContext().getProperties().setProperty(JsonLoader.NESTED_LOAD_KEY, "true");
    logAndRegisterQuery(pigServer, "data = load '" + tempFile.getAbsolutePath() + "' using com.twitter.elephantbird.pig.load.JsonLoader() as (json: map[]);");
    logAndRegisterQuery(pigServer, "a = foreach data generate json#'entities'#'hashtags' as h;");
    logAndRegisterQuery(pigServer, "b = foreach a generate flatten(h) as h;");
    logAndRegisterQuery(pigServer, "c = foreach b generate h#'text' as h;");
    Iterator<Tuple> tuples = pigServer.openIterator("c");
    int count = 0;
    String[] hashtags = { "test1", "test2" };
    while (tuples.hasNext()) {
        Tuple t = tuples.next();
        // tuples are expected in input order: test1 then test2
        Assert.assertEquals(hashtags[count], t.get(0).toString());
        count++;
    }
    // expect two tuples
    Assert.assertEquals(2, count);
}
Use of org.apache.pig.PigServer in the elephant-bird project by Twitter.
Class PigTestUtil, method makePigServer.
/**
 * Creates a new PigServer running in local mode.
 *
 * <p>Configures the lzo compression codec and a temp directory rooted under
 * the test build directory (system property {@code test.build.data}).
 *
 * @return a freshly configured local-mode PigServer
 * @throws ExecException if the PigServer cannot be created
 */
public static PigServer makePigServer() throws ExecException {
    PigServer server = new PigServer(ExecType.LOCAL);
    java.util.Properties props = server.getPigContext().getProperties();
    // set lzo codec
    props.setProperty("io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec");
    // keep pig scratch files under the test build area
    props.setProperty("pig.temp.dir", System.getProperty("test.build.data") + "/pig-temp");
    return server;
}
Use of org.apache.pig.PigServer in the parquet-mr project by Apache.
Class PerfTest, method load.
// Loads the stored data back, projecting only the first colsToLoad columns,
// counts the rows, and records the elapsed wall-clock time in `results`.
private static void load(String out, int colsToLoad) throws ExecException, IOException {
    long start = System.currentTimeMillis();
    // schema listing the projected columns: "a0: chararray, a1: chararray, ..."
    StringBuilder schema = new StringBuilder("a0: chararray");
    for (int col = 1; col < colsToLoad; col++) {
        schema.append(", a").append(col).append(": chararray");
    }
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    pigServer.registerQuery("B = LOAD '" + out + "' USING " + ParquetLoader.class.getName() + "('" + schema + "');");
    pigServer.registerQuery("C = FOREACH (GROUP B ALL) GENERATE COUNT(B);");
    Iterator<Tuple> it = pigServer.openIterator("C");
    if (!it.hasNext()) {
        throw new RuntimeException("Job failed: no tuple to read");
    }
    // single aggregate tuple holding the row count
    Long count = (Long) it.next().get(0);
    assertEquals(ROW_COUNT, count.longValue());
    long elapsed = System.currentTimeMillis() - start;
    results.append(elapsed + " ms to read " + colsToLoad + " columns\n");
}
Use of org.apache.pig.PigServer in the parquet-mr project by Apache.
Class PerfTest, method main.
// Writes ROW_COUNT rows of COLUMN_COUNT chararray columns to Parquet, then
// reads them back with an increasing number of projected columns, timing each read.
public static void main(String[] args) throws Exception {
    // build schema "a0: chararray, a1: chararray, ..." for COLUMN_COUNT columns
    StringBuilder schemaString = new StringBuilder("a0: chararray");
    for (int i = 1; i < COLUMN_COUNT; i++) {
        schemaString.append(", a").append(i).append(": chararray");
    }
    String out = "target/PerfTest";
    {
        PigServer pigServer = new PigServer(ExecType.LOCAL);
        Data data = Storage.resetData(pigServer);
        Collection<Tuple> list = new ArrayList<Tuple>();
        for (int i = 0; i < ROW_COUNT; i++) {
            Tuple tuple = TupleFactory.getInstance().newTuple(COLUMN_COUNT);
            for (int j = 0; j < COLUMN_COUNT; j++) {
                // cell value encodes its row/column position, e.g. "a3_7"
                tuple.set(j, "a" + i + "_" + j);
            }
            list.add(tuple);
        }
        data.set("in", schemaString.toString(), list);
        pigServer.setBatchOn();
        pigServer.registerQuery("A = LOAD 'in' USING mock.Storage();");
        pigServer.deleteFile(out);
        pigServer.registerQuery("Store A into '" + out + "' using " + ParquetStorer.class.getName() + "();");
        // FIX: executeBatch() was called twice — once for the status check and
        // again to fetch the exception. The second call re-submits an already
        // executed batch, so the exception was read from the wrong job list.
        // Execute once and reuse the resulting job.
        org.apache.pig.backend.executionengine.ExecJob job = pigServer.executeBatch().get(0);
        if (job.getStatus() != JOB_STATUS.COMPLETED) {
            throw new RuntimeException("Job failed", job.getException());
        }
    }
    // read back with a growing number of projected columns to measure projection cost
    load(out, 1);
    load(out, 2);
    load(out, 3);
    load(out, 4);
    load(out, 5);
    load(out, 10);
    load(out, 20);
    load(out, 50);
    System.out.println(results);
}
Use of org.apache.pig.PigServer in the parquet-mr project by Apache.
Class TestParquetLoader, method testColumnIndexAccessProjection.
// Stores four typed columns to Parquet, reloads them with renamed fields in
// column-index-access mode ('true'), projects columns 1 and 3, and verifies
// the projected values come back by position.
@Test
public void testColumnIndexAccessProjection() throws Exception {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    pigServer.setValidateEachStatement(true);
    String out = "target/out";
    int rows = 10;
    Data data = Storage.resetData(pigServer);
    List<Tuple> input = new ArrayList<Tuple>();
    for (int row = 0; row < rows; row++) {
        input.add(Storage.tuple(row, row * 1.0, row * 2L, "v" + row));
    }
    data.set("in", "c1:int, c2:double, c3:long, c4:chararray", input);
    pigServer.setBatchOn();
    pigServer.registerQuery("A = LOAD 'in' USING mock.Storage();");
    pigServer.deleteFile(out);
    pigServer.registerQuery("Store A into '" + out + "' using " + ParquetStorer.class.getName() + "();");
    pigServer.executeBatch();
    // reload under new column names; 'true' enables column-index access
    pigServer.registerQuery("B = LOAD '" + out + "' using " + ParquetLoader.class.getName() + "('n1:int, n2:double, n3:long, n4:chararray', 'true');");
    pigServer.registerQuery("C = foreach B generate n1, n3;");
    pigServer.registerQuery("STORE C into 'out' using mock.Storage();");
    pigServer.executeBatch();
    List<Tuple> projected = data.get("out");
    assertEquals(rows, projected.size());
    for (int row = 0; row < rows; row++) {
        Tuple tuple = projected.get(row);
        // only the two projected columns survive: n1 (== row) and n3 (== 2*row)
        assertEquals(2, tuple.size());
        assertEquals(row, tuple.get(0));
        assertEquals(row * 2L, tuple.get(1));
    }
}
Aggregations