Search in sources :

Example 56 with PigServer

Use of org.apache.pig.PigServer in the elephant-bird project by Twitter.

From the class TestJsonLoader, method testNestedLoad.

@Test
public void testNestedLoad() throws IOException {
    File tempFile = File.createTempFile("json", null);
    tempFile.deleteOnExit();
    // json structure as in Twitter Streaming
    // try-with-resources closes (and flushes) the writer even if write() throws,
    // so the temp file never keeps a dangling open handle.
    try (FileWriter writer = new FileWriter(tempFile)) {
        writer.write("{" + "  \"entities\": {" + "    \"hashtags\": [" + "      {\"indices\": [0,0], \"text\": \"test1\"}," + "      {\"indices\": [0,0], \"text\": \"test2\"}" + "    ]," + "    \"user_mentions\": []," + "    \"urls\": []" + "  }" + "}");
    }
    // extract hashtags from it
    PigServer pigServer = PigTestUtil.makePigServer();
    // enable nested load
    pigServer.getPigContext().getProperties().setProperty(JsonLoader.NESTED_LOAD_KEY, "true");
    logAndRegisterQuery(pigServer, "data = load '" + tempFile.getAbsolutePath() + "' using com.twitter.elephantbird.pig.load.JsonLoader() as (json: map[]);");
    logAndRegisterQuery(pigServer, "a = foreach data generate json#'entities'#'hashtags' as h;");
    logAndRegisterQuery(pigServer, "b = foreach a generate flatten(h) as h;");
    logAndRegisterQuery(pigServer, "c = foreach b generate h#'text' as h;");
    Iterator<Tuple> tuples = pigServer.openIterator("c");
    int count = 0;
    String[] hashtags = { "test1", "test2" };
    while (tuples.hasNext()) {
        Tuple t = tuples.next();
        // Bounds-check first so an unexpected extra tuple reports a clear
        // assertion failure instead of an ArrayIndexOutOfBoundsException.
        Assert.assertTrue("more tuples than expected", count < hashtags.length);
        Assert.assertEquals(hashtags[count], t.get(0).toString());
        count++;
    }
    // expect two tuples
    Assert.assertEquals(2, count);
}
Also used : PigServer(org.apache.pig.PigServer) FileWriter(java.io.FileWriter) File(java.io.File) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Example 57 with PigServer

Use of org.apache.pig.PigServer in the elephant-bird project by Twitter.

From the class PigTestUtil, method makePigServer.

/**
 * Creates a new PigServer in local mode.
 * Sets pig properties for lzo codec and temp directory.
 *
 * @return a configured local-mode {@link PigServer}
 * @throws ExecException if the local Pig server cannot be created
 */
public static PigServer makePigServer() throws ExecException {
    PigServer pigServer = new PigServer(ExecType.LOCAL);
    // set lzo codec:
    pigServer.getPigContext().getProperties().setProperty("io.compression.codecs", "com.hadoop.compression.lzo.LzopCodec");
    // Guard against test.build.data being unset: the one-arg getProperty would
    // return null and the temp dir would literally become "null/pig-temp".
    // NOTE(review): "target/test-data" chosen as a Maven-conventional fallback —
    // confirm against the build configuration.
    pigServer.getPigContext().getProperties().setProperty("pig.temp.dir", System.getProperty("test.build.data", "target/test-data") + "/pig-temp");
    return pigServer;
}
Also used : PigServer(org.apache.pig.PigServer)

Example 58 with PigServer

Use of org.apache.pig.PigServer in the parquet-mr project by Apache.

From the class PerfTest, method load.

/**
 * Loads the first {@code colsToLoad} columns back from the Parquet output at
 * {@code out}, verifies the row count, and records the elapsed read time in
 * {@code results}.
 */
private static void load(String out, int colsToLoad) throws ExecException, IOException {
    long startMillis = System.currentTimeMillis();
    // Projection schema: "a0: chararray, a1: chararray, ..." up to colsToLoad.
    StringBuilder schema = new StringBuilder("a0: chararray");
    for (int col = 1; col < colsToLoad; col++) {
        schema.append(", a").append(col).append(": chararray");
    }
    PigServer server = new PigServer(ExecType.LOCAL);
    server.registerQuery("B = LOAD '" + out + "' USING " + ParquetLoader.class.getName() + "('" + schema + "');");
    server.registerQuery("C = FOREACH (GROUP B ALL) GENERATE COUNT(B);");
    Iterator<Tuple> rows = server.openIterator("C");
    if (!rows.hasNext()) {
        throw new RuntimeException("Job failed: no tuple to read");
    }
    Long count = (Long) rows.next().get(0);
    assertEquals(ROW_COUNT, count.longValue());
    long endMillis = System.currentTimeMillis();
    results.append((endMillis - startMillis) + " ms to read " + colsToLoad + " columns\n");
}
Also used : PigServer(org.apache.pig.PigServer) Tuple(org.apache.pig.data.Tuple)

Example 59 with PigServer

Use of org.apache.pig.PigServer in the parquet-mr project by Apache.

From the class PerfTest, method main.

/**
 * Writes ROW_COUNT rows of COLUMN_COUNT synthetic string cells to Parquet,
 * then times reads with an increasing number of projected columns.
 */
public static void main(String[] args) throws Exception {
    // Full schema "a0: chararray, a1: chararray, ..." for the generated data.
    StringBuilder schemaString = new StringBuilder("a0: chararray");
    for (int i = 1; i < COLUMN_COUNT; i++) {
        schemaString.append(", a" + i + ": chararray");
    }
    String out = "target/PerfTest";
    {
        PigServer pigServer = new PigServer(ExecType.LOCAL);
        Data data = Storage.resetData(pigServer);
        Collection<Tuple> list = new ArrayList<Tuple>();
        for (int i = 0; i < ROW_COUNT; i++) {
            Tuple tuple = TupleFactory.getInstance().newTuple(COLUMN_COUNT);
            for (int j = 0; j < COLUMN_COUNT; j++) {
                tuple.set(j, "a" + i + "_" + j);
            }
            list.add(tuple);
        }
        data.set("in", schemaString.toString(), list);
        pigServer.setBatchOn();
        pigServer.registerQuery("A = LOAD 'in' USING mock.Storage();");
        pigServer.deleteFile(out);
        pigServer.registerQuery("Store A into '" + out + "' using " + ParquetStorer.class.getName() + "();");
        // Execute the batch exactly once and keep the job handle: the original
        // called executeBatch() a second time in the error branch, which re-ran
        // the whole store job just to fetch the exception of a *new* execution.
        org.apache.pig.backend.executionengine.ExecJob job = pigServer.executeBatch().get(0);
        if (job.getStatus() != JOB_STATUS.COMPLETED) {
            throw new RuntimeException("Job failed", job.getException());
        }
    }
    // Time reads with progressively wider projections.
    load(out, 1);
    load(out, 2);
    load(out, 3);
    load(out, 4);
    load(out, 5);
    load(out, 10);
    load(out, 20);
    load(out, 50);
    System.out.println(results);
}
Also used : PigServer(org.apache.pig.PigServer) Collection(java.util.Collection) Data(org.apache.pig.builtin.mock.Storage.Data) Tuple(org.apache.pig.data.Tuple)

Example 60 with PigServer

Use of org.apache.pig.PigServer in the parquet-mr project by Apache.

From the class TestParquetLoader, method testColumnIndexAccessProjection.

@Test
public void testColumnIndexAccessProjection() throws Exception {
    // Store rows through ParquetStorer, then reload with renamed columns and a
    // two-column projection, verifying the projected values round-trip.
    PigServer server = new PigServer(ExecType.LOCAL);
    server.setValidateEachStatement(true);
    String out = "target/out";
    int rows = 10;
    Data data = Storage.resetData(server);
    List<Tuple> input = new ArrayList<Tuple>();
    for (int i = 0; i < rows; i++) {
        input.add(Storage.tuple(i, i * 1.0, i * 2L, "v" + i));
    }
    data.set("in", "c1:int, c2:double, c3:long, c4:chararray", input);
    server.setBatchOn();
    server.registerQuery("A = LOAD 'in' USING mock.Storage();");
    server.deleteFile(out);
    server.registerQuery("Store A into '" + out + "' using " + ParquetStorer.class.getName() + "();");
    server.executeBatch();
    // The 'true' argument switches the loader to column-index access, so the
    // renamed schema (n1..n4) maps onto the stored columns positionally.
    server.registerQuery("B = LOAD '" + out + "' using " + ParquetLoader.class.getName() + "('n1:int, n2:double, n3:long, n4:chararray', 'true');");
    server.registerQuery("C = foreach B generate n1, n3;");
    server.registerQuery("STORE C into 'out' using mock.Storage();");
    server.executeBatch();
    List<Tuple> actual = data.get("out");
    assertEquals(rows, actual.size());
    for (int i = 0; i < rows; i++) {
        Tuple row = actual.get(i);
        assertEquals(2, row.size());
        assertEquals(i, row.get(0));
        assertEquals(i * 2L, row.get(1));
    }
}
Also used : PigServer(org.apache.pig.PigServer) ArrayList(java.util.ArrayList) Data(org.apache.pig.builtin.mock.Storage.Data) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Aggregations

PigServer (org.apache.pig.PigServer)115 Tuple (org.apache.pig.data.Tuple)74 ArrayList (java.util.ArrayList)70 Test (org.junit.Test)59 HCatBaseTest (org.apache.hive.hcatalog.mapreduce.HCatBaseTest)37 Data (org.apache.pig.builtin.mock.Storage.Data)15 File (java.io.File)14 Schema (org.apache.pig.impl.logicalLayer.schema.Schema)14 FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema)9 Properties (java.util.Properties)8 Vector (java.util.Vector)8 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)6 Path (org.apache.hadoop.fs.Path)4 FileWriter (java.io.FileWriter)3 List (java.util.List)3 Map (java.util.Map)3 Admin (org.apache.hadoop.hbase.client.Admin)3 Connection (org.apache.hadoop.hbase.client.Connection)3 Pair (org.apache.hive.hcatalog.data.Pair)3 ExecJob (org.apache.pig.backend.executionengine.ExecJob)3