Search in sources :

Example 1 with PigServer

use of org.apache.pig.PigServer in project druid by druid-io.

the class AvroHadoopInputRowParserTest method buildPigAvro.

/**
 * Round-trips an Avro record through Pig: writes {@code datum} to a temp Avro file,
 * loads it with {@code inputStorage}, stores it back out with {@code outputStorage},
 * and returns the first record read from the stored output.
 *
 * @param datum         the Avro record to round-trip
 * @param inputStorage  Pig LOAD storage clause (e.g. an AvroStorage invocation)
 * @param outputStorage Pig STORE storage function class name
 * @return the first GenericRecord read back from Pig's output
 * @throws IOException on any file or Pig failure
 */
private static GenericRecord buildPigAvro(GenericRecord datum, String inputStorage, String outputStorage) throws IOException {
    final File tmpDir = Files.createTempDir();
    FileReader<GenericRecord> reader = null;
    PigServer pigServer = null;
    try {
        // 0. write avro object into temp file.
        File someAvroDatumFile = new File(tmpDir, "someAvroDatum.avro");
        // try-with-resources: the original leaked the writer if create()/append() threw
        try (DataFileWriter<GenericRecord> dataFileWriter = new DataFileWriter<GenericRecord>(new GenericDatumWriter<GenericRecord>())) {
            dataFileWriter.create(SomeAvroDatum.getClassSchema(), someAvroDatumFile);
            dataFileWriter.append(datum);
        }
        // 1. read avro files into Pig
        pigServer = new PigServer(ExecType.LOCAL);
        pigServer.registerQuery(String.format("A = LOAD '%s' USING %s;", someAvroDatumFile, inputStorage));
        // 2. write new avro file using AvroStorage
        File outputDir = new File(tmpDir, "output");
        pigServer.store("A", String.valueOf(outputDir), outputStorage);
        // 3. read avro object from AvroStorage; "part-m-00000.avro" is the name of the
        // single map-task output file Pig produces in LOCAL mode
        reader = DataFileReader.openReader(new File(outputDir, "part-m-00000.avro"), new GenericDatumReader<GenericRecord>());
        return reader.next();
    } finally {
        if (pigServer != null) {
            pigServer.shutdown();
        }
        // swallowIOException=true: a failed close must not mask an in-flight exception
        Closeables.close(reader, true);
        FileUtils.deleteDirectory(tmpDir);
    }
}
Also used : PigServer(org.apache.pig.PigServer) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) DataFileWriter(org.apache.avro.file.DataFileWriter) GenericRecord(org.apache.avro.generic.GenericRecord) File(java.io.File)

Example 2 with PigServer

use of org.apache.pig.PigServer in project hive by apache.

the class HCatBaseTest method createPigServer.

/**
 * Creates a {@link PigServer} running in LOCAL mode.
 * http://pig.apache.org/docs/r0.12.0/perf.html#error-handling
 * @param stopOnFailure equivalent of "-stop_on_failure" command line arg, setting to 'true' makes
 *                      debugging easier
 */
public static PigServer createPigServer(boolean stopOnFailure) throws ExecException {
    if (!stopOnFailure) {
        return new PigServer(ExecType.LOCAL);
    }
    // Propagate the stop-on-failure flag to the server via its Properties.
    Properties props = new Properties();
    props.setProperty("stop.on.failure", Boolean.TRUE.toString());
    return new PigServer(ExecType.LOCAL, props);
}
Also used : PigServer(org.apache.pig.PigServer) Properties(java.util.Properties)

Example 3 with PigServer

use of org.apache.pig.PigServer in project Resource by lovelifeming.

the class PigOperator method excutePig.

/**
 * Runs a word-count style Pig script: tokenizes each input line, groups the tokens,
 * and stores each token with its occurrence count to {@code output}.
 * (Method name "excutePig" is a pre-existing typo kept for caller compatibility.)
 *
 * @param execTypeString Pig execution mode, e.g. "local" or "mapreduce"
 * @param jarPath        path of a jar to register with the Pig runtime
 * @param input          input file/path to load
 * @param output         destination path for the counts
 * @throws IOException if any Pig statement fails to parse or execute
 */
public static void excutePig(String execTypeString, String jarPath, String input, String output) throws IOException {
    PigServer pigServer = new PigServer(execTypeString);
    pigServer.registerJar(jarPath);
    // String input = "/opt/sf/input.txt";
    // String output = "/opt/sf/output.txt";
    // FIX: added the missing space after "load" so the path is tokenized correctly.
    pigServer.registerQuery("A = load '" + input + "' using TextLoader();");
    pigServer.registerQuery("B = foreach A generate flatten(tokenize($0));");
    // FIX: the two statements below were missing their terminating semicolons,
    // which PigServer.registerQuery requires to parse a statement.
    // FIX: B has a single field ($0, the token) after flatten(tokenize($0)),
    // so grouping must be by $0 — the original "$1" referenced a nonexistent field.
    pigServer.registerQuery("C = group B by $0;");
    pigServer.registerQuery("D = foreach C generate flatten(group),COUNT(B.$0);");
    pigServer.store("D", output);
}
Also used : PigServer(org.apache.pig.PigServer)

Example 4 with PigServer

use of org.apache.pig.PigServer in project elephant-bird by twitter.

the class TestJsonLoader method testPigScript.

@Test
public void testPigScript() throws IOException {
    // Write three one-line JSON records to a temp file for JsonLoader to consume.
    File tempFile = File.createTempFile("json", null);
    tempFile.deleteOnExit();
    // try-with-resources: the original leaked the writer if a write() threw
    try (FileWriter writer = new FileWriter(tempFile)) {
        writer.write("{\"score\": 10}\n");
        writer.write("{\"score\": 20}\n");
        writer.write("{\"score\": 30}\n");
    }
    PigServer pigServer = PigTestUtil.makePigServer();
    logAndRegisterQuery(pigServer, "data = load '" + tempFile.getAbsolutePath() + "' using com.twitter.elephantbird.pig.load.JsonLoader() as (json: map[]);");
    logAndRegisterQuery(pigServer, "a = foreach data generate (int) json#'score' as score;");
    logAndRegisterQuery(pigServer, "b = group a all;");
    logAndRegisterQuery(pigServer, "c = foreach b generate SUM(a.score) as total_score;");
    Iterator<Tuple> tuples = pigServer.openIterator("c");
    int count = 0;
    while (tuples.hasNext()) {
        Tuple t = tuples.next();
        // expected sum of scores; Long.valueOf replaces the deprecated new Long(60)
        Assert.assertEquals(Long.valueOf(60L), t.get(0));
        count++;
    }
    // expect just one tuple
    Assert.assertEquals(1, count);
}
Also used : PigServer(org.apache.pig.PigServer) FileWriter(java.io.FileWriter) File(java.io.File) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Example 5 with PigServer

use of org.apache.pig.PigServer in project elephant-bird by twitter.

the class TestJsonLoader method testNestedLoad.

@Test
public void testNestedLoad() throws IOException {
    File tempFile = File.createTempFile("json", null);
    tempFile.deleteOnExit();
    // json structure as in Twitter Streaming
    // try-with-resources: the original leaked the writer if write() threw
    try (FileWriter writer = new FileWriter(tempFile)) {
        writer.write("{" + "  \"entities\": {" + "    \"hashtags\": [" + "      {\"indices\": [0,0], \"text\": \"test1\"}," + "      {\"indices\": [0,0], \"text\": \"test2\"}" + "    ]," + "    \"user_mentions\": []," + "    \"urls\": []" + "  }" + "}");
    }
    // extract hashtags from it
    PigServer pigServer = PigTestUtil.makePigServer();
    // enable nested load so JsonLoader materializes nested maps/bags instead of strings
    pigServer.getPigContext().getProperties().setProperty(JsonLoader.NESTED_LOAD_KEY, "true");
    logAndRegisterQuery(pigServer, "data = load '" + tempFile.getAbsolutePath() + "' using com.twitter.elephantbird.pig.load.JsonLoader() as (json: map[]);");
    logAndRegisterQuery(pigServer, "a = foreach data generate json#'entities'#'hashtags' as h;");
    logAndRegisterQuery(pigServer, "b = foreach a generate flatten(h) as h;");
    logAndRegisterQuery(pigServer, "c = foreach b generate h#'text' as h;");
    Iterator<Tuple> tuples = pigServer.openIterator("c");
    int count = 0;
    String[] hashtags = { "test1", "test2" };
    while (tuples.hasNext()) {
        Tuple t = tuples.next();
        Assert.assertEquals(hashtags[count], t.get(0).toString());
        count++;
    }
    // expect two tuples
    Assert.assertEquals(2, count);
}
Also used : PigServer(org.apache.pig.PigServer) FileWriter(java.io.FileWriter) File(java.io.File) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Aggregations

PigServer (org.apache.pig.PigServer)115 Tuple (org.apache.pig.data.Tuple)74 ArrayList (java.util.ArrayList)70 Test (org.junit.Test)59 HCatBaseTest (org.apache.hive.hcatalog.mapreduce.HCatBaseTest)37 Data (org.apache.pig.builtin.mock.Storage.Data)15 File (java.io.File)14 Schema (org.apache.pig.impl.logicalLayer.schema.Schema)14 FieldSchema (org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema)9 Properties (java.util.Properties)8 Vector (java.util.Vector)8 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)6 Path (org.apache.hadoop.fs.Path)4 FileWriter (java.io.FileWriter)3 List (java.util.List)3 Map (java.util.Map)3 Admin (org.apache.hadoop.hbase.client.Admin)3 Connection (org.apache.hadoop.hbase.client.Connection)3 Pair (org.apache.hive.hcatalog.data.Pair)3 ExecJob (org.apache.pig.backend.executionengine.ExecJob)3