Use of org.apache.hive.hcatalog.data.schema.HCatSchema in the Apache Hive project: class TestE2EScenarios, method copyTable.
private void copyTable(String in, String out) throws IOException, InterruptedException {
  Job ijob = new Job();
  Job ojob = new Job();
  HCatInputFormat inpy = new HCatInputFormat();
  inpy.setInput(ijob, null, in);
  HCatOutputFormat oupy = new HCatOutputFormat();
  oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

  // Test HCatContext
  System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
  if (HCatContext.INSTANCE.getConf().isPresent()) {
    System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
        .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
            HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
  }

  HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
  System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
  oupy.setSchema(ojob, tableSchema);
  oupy.checkOutputSpecs(ojob);
  OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
  oc.setupJob(ojob);

  for (InputSplit split : inpy.getSplits(ijob)) {
    TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
    TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

    RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
    rr.initialize(split, rtaskContext);

    OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
    taskOc.setupTask(wtaskContext);
    RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

    while (rr.nextKeyValue()) {
      rw.write(rr.getCurrentKey(), rr.getCurrentValue());
    }
    rw.close(wtaskContext);
    taskOc.commitTask(wtaskContext);
    rr.close();
  }
  oc.commitJob(ojob);
}
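The createTaskAttemptContext helper called above belongs to the test class and is not part of this listing. Below is a minimal sketch of what such a helper might look like, assuming it simply wraps Hadoop's TaskAttemptContextImpl with a fresh TaskAttemptID; the real test helper may instead go through Hive's Hadoop shims, so treat the body as an illustration only.

// Hypothetical helper, not the actual Hive test code.
// Assumes imports of org.apache.hadoop.conf.Configuration,
// org.apache.hadoop.mapreduce.TaskAttemptContext, org.apache.hadoop.mapreduce.TaskAttemptID,
// and org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl.
private TaskAttemptContext createTaskAttemptContext(Configuration conf) {
  // Build a task-level context around the given job configuration with a default attempt id.
  return new TaskAttemptContextImpl(conf, new TaskAttemptID());
}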
Use of org.apache.hive.hcatalog.data.schema.HCatSchema in the Apache Hive project: class ReadWrite, method run.
public int run(String[] args) throws Exception {
  Configuration conf = getConf();
  args = new GenericOptionsParser(conf, args).getRemainingArgs();
  String serverUri = args[0];
  String inputTableName = args[1];
  String outputTableName = args[2];
  String dbName = null;

  String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
  if (principalID != null) {
    conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
  }

  Job job = new Job(conf, "ReadWrite");

  // initialize HCatInputFormat
  HCatInputFormat.setInput(job, dbName, inputTableName);
  job.setInputFormatClass(HCatInputFormat.class);
  job.setJarByClass(ReadWrite.class);
  job.setMapperClass(Map.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(DefaultHCatRecord.class);

  // initialize HCatOutputFormat, reusing the input table's schema for the output table
  HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, null));
  HCatSchema s = HCatInputFormat.getTableSchema(job);
  System.err.println("INFO: output schema explicitly set for writing: " + s);
  HCatOutputFormat.setSchema(job, s);
  job.setOutputFormatClass(HCatOutputFormat.class);

  return (job.waitForCompletion(true) ? 0 : 1);
}
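The Map mapper wired in above is not shown in this listing. The sketch below is a minimal illustration of what such a mapper could look like, assuming it looks one column up by name through the table's HCatSchema and forwards each record unchanged; the column name "somecol" and the class body are assumptions, not the actual Hive example code.

// Hypothetical mapper sketch.
// Assumes imports of java.io.IOException, org.apache.hadoop.io.Text,
// org.apache.hadoop.io.WritableComparable, org.apache.hadoop.mapreduce.Mapper,
// org.apache.hive.hcatalog.data.HCatRecord, org.apache.hive.hcatalog.data.DefaultHCatRecord,
// org.apache.hive.hcatalog.data.schema.HCatSchema, and org.apache.hive.hcatalog.mapreduce.HCatInputFormat.
public static class Map extends Mapper<WritableComparable, HCatRecord, Text, DefaultHCatRecord> {
  private HCatSchema schema;

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    // The schema registered by HCatInputFormat.setInput() is available from the task configuration.
    schema = HCatInputFormat.getTableSchema(context.getConfiguration());
  }

  @Override
  protected void map(WritableComparable key, HCatRecord value, Context context)
      throws IOException, InterruptedException {
    // Look a field up by name via the schema, then forward the record as-is.
    Object somecol = value.get("somecol", schema);
    context.write(new Text(String.valueOf(somecol)), new DefaultHCatRecord(value.getAll()));
  }
}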
Use of org.apache.hive.hcatalog.data.schema.HCatSchema in the Apache Hive project: class TestPigHCatUtil, method testGetBagSubSchemaConfigured.
@Test
public void testGetBagSubSchemaConfigured() throws Exception {
  // NOTE: pig-0.8 sets client system properties by actually getting the client
  // system properties. Starting in pig-0.9 you must pass the properties in.
  // When updating our pig dependency this will need to be updated.
  System.setProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME, "t");
  System.setProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME, "FIELDNAME_tuple");
  UDFContext.getUDFContext().setClientSystemProps(System.getProperties());

  // Define the expected schema.
  ResourceFieldSchema[] bagSubFieldSchemas = new ResourceFieldSchema[1];
  bagSubFieldSchemas[0] = new ResourceFieldSchema().setName("t")
      .setDescription("The tuple in the bag").setType(DataType.TUPLE);
  ResourceFieldSchema[] innerTupleFieldSchemas = new ResourceFieldSchema[1];
  innerTupleFieldSchemas[0] = new ResourceFieldSchema().setName("llama_tuple").setType(DataType.CHARARRAY);
  bagSubFieldSchemas[0].setSchema(new ResourceSchema().setFields(innerTupleFieldSchemas));
  ResourceSchema expected = new ResourceSchema().setFields(bagSubFieldSchemas);

  // Get the actual converted schema.
  HCatSchema actualHCatSchema = new HCatSchema(
      Lists.newArrayList(new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
  HCatFieldSchema actualHCatFieldSchema = new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, actualHCatSchema, null);
  ResourceSchema actual = PigHCatUtil.getBagSubSchema(actualHCatFieldSchema);

  Assert.assertEquals(expected.toString(), actual.toString());

  // Clean up the system properties that were set by this test.
  System.clearProperty(HCatConstants.HCAT_PIG_INNER_TUPLE_NAME);
  System.clearProperty(HCatConstants.HCAT_PIG_INNER_FIELD_NAME);
}
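For reference, the nested schema built in this test can also be inspected directly through the HCatSchema and HCatFieldSchema accessors. The snippet below is illustrative only and not part of the test; the helper name inspectBagField is made up.

// Hypothetical helper showing the accessors for an ARRAY (bag) field like the one above.
// Assumes imports of com.google.common.collect.Lists, org.apache.hive.hcatalog.common.HCatException,
// org.apache.hive.hcatalog.data.schema.HCatSchema, and org.apache.hive.hcatalog.data.schema.HCatFieldSchema.
private static void inspectBagField() throws HCatException {
  HCatSchema elementSchema = new HCatSchema(
      Lists.newArrayList(new HCatFieldSchema("innerLlama", HCatFieldSchema.Type.STRING, null)));
  HCatFieldSchema bagField = new HCatFieldSchema("llama", HCatFieldSchema.Type.ARRAY, elementSchema, null);

  System.out.println(bagField.getName());                                 // llama
  System.out.println(bagField.getType());                                 // ARRAY
  System.out.println(bagField.getArrayElementSchema().get(0).getName());  // innerLlama
}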
Use of org.apache.hive.hcatalog.data.schema.HCatSchema in the Apache Hive project: class StoreDemo, method main.
public static void main(String[] args) throws Exception {
  Configuration conf = new Configuration();
  args = new GenericOptionsParser(conf, args).getRemainingArgs();
  String[] otherArgs = new String[1];
  int j = 0;
  for (int i = 0; i < args.length; i++) {
    if (args[i].equals("-libjars")) {
      // generic options parser doesn't seem to work!
      conf.set("tmpjars", args[i + 1]);
      // advance past the jar list so it isn't treated as a positional argument
      i = i + 1;
    } else {
      otherArgs[j++] = args[i];
    }
  }
  if (otherArgs.length != 1) {
    usage();
  }
  String serverUri = otherArgs[0];
  String tableName = NUMBERS_TABLE_NAME;
  String dbName = "default";
  Map<String, String> outputPartitionKvps = new HashMap<String, String>();
  String outputTableName = NUMBERS_PARTITIONED_TABLE_NAME;
  outputPartitionKvps.put("datestamp", "20100102");

  String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
  if (principalID != null) {
    conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
  }

  Job job = new Job(conf, "storedemo");
  // initialize HCatInputFormat
  HCatInputFormat.setInput(job, dbName, tableName);
  // initialize HCatOutputFormat
  HCatOutputFormat.setOutput(job, OutputJobInfo.create(dbName, outputTableName, outputPartitionKvps));
  // test with and without specifying schema randomly
  HCatSchema s = HCatInputFormat.getTableSchema(job);
  System.err.println("INFO: output schema explicitly set for writing: " + s);
  HCatOutputFormat.setSchema(job, s);

  job.setInputFormatClass(HCatInputFormat.class);
  job.setOutputFormatClass(HCatOutputFormat.class);
  job.setJarByClass(StoreDemo.class);
  job.setMapperClass(SumMapper.class);
  job.setOutputKeyClass(IntWritable.class);
  job.setNumReduceTasks(0);
  job.setOutputValueClass(DefaultHCatRecord.class);

  System.exit(job.waitForCompletion(true) ? 0 : 1);
}
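The SumMapper referenced above is not included in this listing. Below is a minimal sketch of what a map-only mapper emitting DefaultHCatRecord output could look like; the column names "intnum" and "id", the summing logic, and the class body are assumptions for illustration, not the actual StoreDemo code.

// Hypothetical mapper sketch: sums two numeric columns of each input HCatRecord
// and emits a one-field DefaultHCatRecord for HCatOutputFormat.
// Assumes imports of java.io.IOException, org.apache.hadoop.io.IntWritable,
// org.apache.hadoop.io.WritableComparable, org.apache.hadoop.mapreduce.Mapper,
// org.apache.hive.hcatalog.data.HCatRecord, org.apache.hive.hcatalog.data.DefaultHCatRecord,
// org.apache.hive.hcatalog.data.schema.HCatSchema, and org.apache.hive.hcatalog.mapreduce.HCatInputFormat.
public static class SumMapper extends Mapper<WritableComparable, HCatRecord, IntWritable, DefaultHCatRecord> {
  private HCatSchema inputSchema;

  @Override
  protected void setup(Context context) throws IOException, InterruptedException {
    inputSchema = HCatInputFormat.getTableSchema(context.getConfiguration());
  }

  @Override
  protected void map(WritableComparable key, HCatRecord value, Context context)
      throws IOException, InterruptedException {
    // "intnum" and "id" are made-up column names looked up via the input schema.
    int sum = ((Integer) value.get("intnum", inputSchema)) + ((Integer) value.get("id", inputSchema));
    DefaultHCatRecord out = new DefaultHCatRecord(1);
    out.set(0, sum);
    context.write(new IntWritable(sum), out);
  }
}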
Use of org.apache.hive.hcatalog.data.schema.HCatSchema in the Apache Hive project: class TypeDataCheck, method run.
public int run(String[] args) {
  try {
    args = new GenericOptionsParser(conf, args).getRemainingArgs();
    String[] otherArgs = new String[5];
    int j = 0;
    for (int i = 0; i < args.length; i++) {
      if (args[i].equals("-libjars")) {
        conf.set("tmpjars", args[i + 1]);
        // advance past the jar list so it isn't treated as a positional argument
        i = i + 1;
      } else {
        otherArgs[j++] = args[i];
      }
    }
    if (otherArgs.length != 5) {
      System.err.println("Other args:" + Arrays.asList(otherArgs));
      System.err.println("Usage: hadoop jar testudf.jar typedatacheck "
          + "<serveruri> <tablename> <hive types of cols + delimited> "
          + "<output dir> <tab|ctrla> <-libjars hive-hcat jar>\n"
          + "The <tab|ctrla> argument controls the output delimiter.\n"
          + "The hcat jar location should be specified as file://<full path to jar>\n");
      System.exit(2);
    }
    String serverUri = otherArgs[0];
    String tableName = otherArgs[1];
    String schemaStr = otherArgs[2];
    String outputDir = otherArgs[3];
    String outputdelim = otherArgs[4];
    if (!outputdelim.equals("tab") && !outputdelim.equals("ctrla")) {
      System.err.println("ERROR: Specify 'tab' or 'ctrla' for output delimiter");
    }
    String dbName = "default";

    String principalID = System.getProperty(HCatConstants.HCAT_METASTORE_PRINCIPAL);
    if (principalID != null) {
      conf.set(HCatConstants.HCAT_METASTORE_PRINCIPAL, principalID);
    }

    Job job = new Job(conf, "typedatacheck");
    // initialize HCatInputFormat
    HCatInputFormat.setInput(job, dbName, tableName);
    HCatSchema s = HCatInputFormat.getTableSchema(job);
    job.getConfiguration().set(SCHEMA_KEY, schemaStr);
    job.getConfiguration().set(DELIM, outputdelim);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setJarByClass(TypeDataCheck.class);
    job.setMapperClass(TypeDataCheckMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(Long.class);
    job.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
    return 0;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
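The TypeDataCheckMapper referenced above is not shown here. The sketch below is a minimal illustration of what such a map-only checker could look like, assuming it reads the delimiter choice from the DELIM configuration key set in run() and emits each record's field values joined by that delimiter; the class body is an assumption, not the actual Hive test code.

// Hypothetical mapper sketch: joins each record's fields with the configured delimiter.
// DELIM mirrors the key set in run(); assumes imports of java.io.IOException,
// org.apache.hadoop.io.Text, org.apache.hadoop.io.WritableComparable,
// org.apache.hadoop.mapreduce.Mapper, and org.apache.hive.hcatalog.data.HCatRecord.
public static class TypeDataCheckMapper extends Mapper<WritableComparable, HCatRecord, Long, Text> {
  private String delim;

  @Override
  protected void setup(Context context) {
    delim = "ctrla".equals(context.getConfiguration().get(DELIM)) ? "\u0001" : "\t";
  }

  @Override
  protected void map(WritableComparable key, HCatRecord value, Context context)
      throws IOException, InterruptedException {
    StringBuilder line = new StringBuilder();
    for (Object field : value.getAll()) {
      if (line.length() > 0) {
        line.append(delim);
      }
      line.append(field);
    }
    context.write(0L, new Text(line.toString()));
  }
}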