Example 26 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

From the class TestE2EScenarios, the method copyTable.

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));
    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get().getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }
    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);
    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());
        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);
        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);
        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }
    oc.commitJob(ojob);
}
Also used : OutputCommitter(org.apache.hadoop.mapreduce.OutputCommitter) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) WritableComparable(org.apache.hadoop.io.WritableComparable) HCatOutputFormat(org.apache.hive.hcatalog.mapreduce.HCatOutputFormat) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatInputFormat(org.apache.hive.hcatalog.mapreduce.HCatInputFormat) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
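
The copyTable test calls a createTaskAttemptContext helper that is defined elsewhere in TestE2EScenarios and is not shown above. A minimal sketch of what such a helper could look like, assuming Hadoop's standard TaskAttemptContextImpl and TaskAttemptID classes:

// Hypothetical stand-in for the helper the test defines elsewhere: wraps a
// Configuration in a TaskAttemptContext with a fresh (empty) TaskAttemptID.
private TaskAttemptContext createTaskAttemptContext(Configuration conf) {
    return new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(
        conf, new org.apache.hadoop.mapreduce.TaskAttemptID());
}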

Example 27 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

From the class ReadWrite, the method run.

public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String serverUri = args[0];
    String inputTableName = args[1];
    String outputTableName = args[2];
    String dbName = null;
    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null)
        conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    Job job = new Job(conf, "ReadWrite");
    // initialize HCatInputFormat
    HCatInputFormat.setInput(job, dbName, inputTableName);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setJarByClass(ReadWrite.class);
    job.setMapperClass(Map.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DefaultHCatRecord.class);
    // initialize HCatOutputFormat
    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    System.err.println("INFO: output schema explicitly set for writing:" + s);
    HCatOutputFormat.setSchema(job, s);
    job.setOutputFormatClass(HCatOutputFormat.class);
    return (job.waitForCompletion(true) ? 0 : 1);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
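
The driver wires in a Mapper named Map that is not shown in this listing. A hypothetical sketch of such a mapper, assuming the standard Hadoop and HCatalog imports plus java.util.ArrayList; it keys each record by the string form of its first column and passes the record through as a DefaultHCatRecord, matching the output key and value classes set on the job:

// Hypothetical pass-through mapper (the real Map class lives elsewhere in the
// example). Each HCatRecord is copied into a DefaultHCatRecord and written out
// keyed by the string form of its first column.
public static class Map extends Mapper<WritableComparable, HCatRecord, Text, DefaultHCatRecord> {
    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        context.write(new Text(String.valueOf(value.get(0))),
            new DefaultHCatRecord(new ArrayList<Object>(value.getAll())));
    }
}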

Example 28 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

From the class TestPigHCatUtil, the method testGetBagSubSchemaConfigured.

@Test
public void testGetBagSubSchemaConfigured() throws Exception {
    // NOTE: pig-0.8 sets client system properties by actually getting the client
    // system properties. Starting in pig-0.9 you must pass the properties in.
    // When updating our pig dependency, this will need to be updated.
    System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
    System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
    UDFContext.getUDFContext().setClientSystemProps(System.getProperties());
    // Define the expected schema.
    ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
    bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t").setDescription("The tuple in the bag").setType(DataType.TUPLE);
    ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
    innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
    bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
    ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);
    // Get the actual converted schema.
    HCatSchema actualHCatSchema = new HCatSchema(Lists.newArrayList(new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
    HCatFieldSchema actualHCatFieldSchema = new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
    ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);
    Assert.assertEquals(expected.toString(), actual.toString());
    // Clean up System properties that were set by this test
    System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
    System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) Test(org.junit.Test)
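
The expected inner field name works out to "llama_tuple" because the configured value "FIELDNAME_tuple" uses FIELDNAME as a placeholder that PigHCatUtil substitutes with the outer bag field's name. A one-line illustration of that substitution (not the actual PigHCatUtil code):

// Hypothetical illustration only: the configured template "FIELDNAME_tuple",
// applied to the outer field "llama", yields the inner name "llama_tuple".
String innerFieldName = "FIELDNAME_tuple".replace("FIELDNAME", "llama");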

Example 29 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

From the class StoreDemo, the method main.

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String[] otherArgs = new String[1];
    int j = 0;
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-libjars")) {
            // generic options parser doesn't seem to work!
            conf.set("tmpjars", args[i + 1]);
            // advance i so the loop's increment also skips the jar list value
            i = i + 1;
        } else {
            otherArgs[j++] = args[i];
        }
    }
    // check the count of args actually captured; otherArgs.length is always 1
    if (j != 1) {
        usage();
    }
    String serverUri = otherArgs[0];
    String tableName = NUMBERS_TABLE_NAME;
    String dbName = "default";
    Map<String, String> outputPartitionKvps = new HashMap<String, String>();
    String outputTableName = NUMBERS_PARTITIONED_TABLE_NAME;
    outputPartitionKvps.put("datestamp", "20100102");
    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null)
        conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    Job job = new Job(conf, "storedemo");
    // initialize HCatInputFormat
    HCatInputFormat.setInput(job, dbName, tableName);
    // initialize HCatOutputFormat
    HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, outputPartitionKvps));
    // test with and without specifying schema randomly
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    System.err.println("INFO: output schema explicitly set for writing:" + s);
    HCatOutputFormat.setSchema(job, s);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(HCatOutputFormat.class);
    job.setJarByClass(StoreDemo.class);
    job.setMapperClass(SumMapper.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setNumReduceTasks(0);
    job.setOutputValueClass(DefaultHCatRecord.class);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) Job(org.apache.hadoop.mapreduce.Job) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
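
StoreDemo references a SumMapper that is defined elsewhere. A hypothetical sketch of such a mapper, assuming the standard Hadoop and HCatalog imports plus java.util.ArrayList; with zero reduce tasks its output goes straight to HCatOutputFormat, so it must emit records compatible with the schema set on the job:

// Hypothetical mapper sketch (the real SumMapper lives elsewhere). It copies
// each input record into a DefaultHCatRecord and writes it through unchanged;
// a real implementation would transform the numeric columns first.
public static class SumMapper extends Mapper<WritableComparable, HCatRecord, IntWritable, DefaultHCatRecord> {
    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        context.write(new IntWritable(0), new DefaultHCatRecord(new ArrayList<Object>(value.getAll())));
    }
}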

Example 30 with HCatSchema

Use of org.apache.hive.hcatalog.data.schema.HCatSchema in project hive by apache.

From the class TypeDataCheck, the method run.

public int run(String[] args) {
    try {
        args = new GenericOptionsParser(conf, args).getRemainingArgs();
        String[] otherArgs = new String[5];
        int j = 0;
        for (int i = 0; i < args.length; i++) {
            if (args[i].equals("-libjars")) {
                conf.set("tmpjars", args[i + 1]);
                // advance i so the loop's increment also skips the jar list value
                i = i + 1;
            } else {
                otherArgs[j++] = args[i];
            }
        }
        // check the count of args actually captured; otherArgs.length is always 5
        if (j != 5) {
            System.err.println("Other args:" + Arrays.asList(otherArgs));
            System.err.println("Usage: hadoop jar testudf.jar typedatacheck " + "<serveruri> <tablename> <hive types of cols + delimited> " + "<output dir> <tab|ctrla> <-libjars hive-hcat jar>\n" + "The <tab|ctrla> argument controls the output delimiter.\n" + "The hcat jar location should be specified as file://<full path to jar>\n");
            System.err.println(" The <tab|ctrla> argument controls the output delimiter.");
            System.exit(2);
        }
        String serverUri = otherArgs[0];
        String tableName = otherArgs[1];
        String schemaStr = otherArgs[2];
        String outputDir = otherArgs[3];
        String outputdelim = otherArgs[4];
        if (!outputdelim.equals("tab") && !outputdelim.equals("ctrla")) {
            System.err.println("ERROR: Specify 'tab' or 'ctrla' for output delimiter");
        }
        String dbName = "default";
        String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
        if (principalID != null) {
            conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
        }
        Job job = new Job(conf, "typedatacheck");
        // initialize HCatInputFormat
        HCatInputFormat.setInput(job, dbName, tableName);
        HCatSchema s = HCatInputFormat.getTableSchema(job);
        job.getConfiguration().set(SCHEMA_KEY, schemaStr);
        job.getConfiguration().set(DELIM, outputdelim);
        job.setInputFormatClass(HCatInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setJarByClass(TypeDataCheck.class);
        job.setMapperClass(TypeDataCheckMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Long.class);
        job.setOutputValueClass(Text.class);
        FileOutputFormat.setOutputPath(job, new Path(outputDir));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
        return 0;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) Job(org.apache.hadoop.mapreduce.Job) IOException(java.io.IOException) GenericOptionsParser(org.apache.hadoop.util.GenericOptionsParser)
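
The TypeDataCheckMapper set on the job is defined elsewhere; the driver hands it the expected type string and the delimiter choice through the configuration keys SCHEMA_KEY and DELIM. A hypothetical sketch of such a mapper, assuming the same constants and the standard Hadoop and HCatalog imports, that renders each record as one delimited line:

// Hypothetical mapper sketch (the real TypeDataCheckMapper lives elsewhere).
// It reads the delimiter choice back from the configuration and joins each
// record's columns into a single Text line keyed by a constant Long.
public static class TypeDataCheckMapper extends Mapper<WritableComparable, HCatRecord, Long, Text> {
    private String delim;

    @Override
    protected void setup(Context context) {
        // "tab" -> \t, otherwise ctrl-A (\u0001), matching the driver's two choices
        delim = "tab".equals(context.getConfiguration().get(DELIM)) ? "\t" : "\u0001";
    }

    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < value.size(); i++) {
            if (i > 0) {
                line.append(delim);
            }
            line.append(value.get(i));
        }
        context.write(Long.valueOf(0), new Text(line.toString()));
    }
}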

Aggregations

HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema): 45 uses
HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema): 21 uses
Job (org.apache.hadoop.mapreduce.Job): 17 uses
ArrayList (java.util.ArrayList): 14 uses
Configuration (org.apache.hadoop.conf.Configuration): 13 uses
HashMap (java.util.HashMap): 10 uses
GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 10 uses
IOException (java.io.IOException): 8 uses
HCatException (org.apache.hive.hcatalog.common.HCatException): 8 uses
Table (org.apache.hadoop.hive.ql.metadata.Table): 6 uses
Test (org.junit.Test): 6 uses
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 5 uses
Properties (java.util.Properties): 4 uses
Path (org.apache.hadoop.fs.Path): 4 uses
ResourceSchema (org.apache.pig.ResourceSchema): 4 uses
FrontendException (org.apache.pig.impl.logicalLayer.FrontendException): 4 uses
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 3 uses
HCatRecord (org.apache.hive.hcatalog.data.HCatRecord): 3 uses
PigException (org.apache.pig.PigException): 3 uses
Map (java.util.Map): 2 uses