
Example 6 with HCatRecord

use of org.apache.hive.hcatalog.data.HCatRecord in project hive by apache.

the class FileOutputFormatContainer method getRecordWriter.

@Override
public RecordWriter<WritableComparable<?>, HCatRecord> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    // This needs to be set manually; under normal circumstances the MR Task does this.
    setWorkOutputPath(context);
    // Configure the output key and value classes.
    // This is required for writing null as key for file based tables.
    context.getConfiguration().set("mapred.output.key.class", NullWritable.class.getName());
    String jobInfoString = context.getConfiguration().get(HCatConstants.HCAT_KEY_OUTPUT_INFO);
    OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(jobInfoString);
    StorerInfo storeInfo = jobInfo.getTableInfo().getStorerInfo();
    HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), storeInfo);
    Class<? extends AbstractSerDe> serde = storageHandler.getSerDeClass();
    AbstractSerDe sd = (AbstractSerDe) ReflectionUtils.newInstance(serde, context.getConfiguration());
    context.getConfiguration().set("mapred.output.value.class", sd.getSerializedClass().getName());
    RecordWriter<WritableComparable<?>, HCatRecord> rw;
    if (HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed()) {
        // When Dynamic partitioning is used, the RecordWriter instance initialized here isn't used. Can use null.
        // (That's because records can't be written until the values of the dynamic partitions are deduced.
        // By that time, a new local instance of RecordWriter, with the correct output-path, will be constructed.)
        rw = new DynamicPartitionFileRecordWriterContainer((org.apache.hadoop.mapred.RecordWriter) null, context);
    } else {
        Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir"));
        Path childPath = new Path(parentDir, FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()), context.getConfiguration().get("mapreduce.output.basename", "part")));
        rw = new StaticPartitionFileRecordWriterContainer(getBaseOutputFormat().getRecordWriter(parentDir.getFileSystem(context.getConfiguration()), new JobConf(context.getConfiguration()), childPath.toString(), InternalUtil.createReporter(context)), context);
    }
    return rw;
}
Also used : Path(org.apache.hadoop.fs.Path) HiveStorageHandler(org.apache.hadoop.hive.ql.metadata.HiveStorageHandler) NullWritable(org.apache.hadoop.io.NullWritable) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) WritableComparable(org.apache.hadoop.io.WritableComparable) JobConf(org.apache.hadoop.mapred.JobConf) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
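
A minimal driver-side sketch of the setup that leads into this getRecordWriter call (database, table, and partition values are placeholders; standard Hadoop/HCatalog imports are assumed): HCatOutputFormat.setOutput serializes an OutputJobInfo into the configuration under HCAT_KEY_OUTPUT_INFO, which the method above deserializes.

// Hypothetical write-side job configuration; not taken from the Hive sources.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "hcat-write-example");
job.setOutputFormatClass(HCatOutputFormat.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DefaultHCatRecord.class);

// Stores the OutputJobInfo that getRecordWriter() later reads back.
Map<String, String> partitionValues = new HashMap<String, String>();
partitionValues.put("part0", "p0value");
HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "mytable", partitionValues));

// Propagate the table schema so records are written with the expected layout.
HCatOutputFormat.setSchema(job, HCatOutputFormat.getTableSchema(job.getConfiguration()));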

Example 7 with HCatRecord

use of org.apache.hive.hcatalog.data.HCatRecord in project hive by apache.

the class HCatRecordReader method nextKeyValue.

/**
 * Check if the wrapped RecordReader has another record, and if so convert it into an
 * HCatRecord. We both check for records and convert here so a configurable percent of
 * bad records can be tolerated.
 *
 * @return if there is a next record
 * @throws IOException on error
 * @throws InterruptedException on error
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (currentKey == null) {
        currentKey = baseRecordReader.createKey();
        currentValue = baseRecordReader.createValue();
    }
    while (baseRecordReader.next(currentKey, currentValue)) {
        HCatRecord r = null;
        Throwable t = null;
        errorTracker.incRecords();
        try {
            Object o = deserializer.deserialize(currentValue);
            r = new LazyHCatRecord(o, deserializer.getObjectInspector());
        } catch (Throwable throwable) {
            t = throwable;
        }
        if (r == null) {
            errorTracker.incErrors(t);
            continue;
        }
        DefaultHCatRecord dr = new DefaultHCatRecord(outputSchema.size());
        int i = 0;
        for (String fieldName : outputSchema.getFieldNames()) {
            if (dataSchema.getPosition(fieldName) != null) {
                dr.set(i, r.get(fieldName, dataSchema));
            } else {
                dr.set(i, valuesNotInDataCols.get(fieldName));
            }
            i++;
        }
        currentHCatRecord = dr;
        return true;
    }
    return false;
}
Also used : DefaultHCatRecord(org.apache.hive.hcatalog.data.DefaultHCatRecord) LazyHCatRecord(org.apache.hive.hcatalog.data.LazyHCatRecord) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
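
A minimal sketch of how the records produced by nextKeyValue() typically reach user code (table and field names are placeholders; standard Hadoop/HCatalog imports are assumed): a mapper in an HCatInputFormat job receives each record as its value and can look fields up by name against the table schema.

// Hypothetical read-side mapper; "c1" is a placeholder column name.
public static class ReadMapper
        extends Mapper<WritableComparable, HCatRecord, Text, IntWritable> {

    private HCatSchema schema;

    @Override
    protected void setup(Context context) throws IOException {
        // Table schema stored in the configuration by HCatInputFormat.setInput().
        schema = HCatInputFormat.getTableSchema(context.getConfiguration());
    }

    @Override
    protected void map(WritableComparable key, HCatRecord value, Context context)
            throws IOException, InterruptedException {
        // Fields can be read positionally or, as here, by name against the schema.
        Integer c1 = (Integer) value.get("c1", schema);
        context.write(new Text("c1"), new IntWritable(c1 == null ? 0 : c1));
    }
}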

Example 8 with HCatRecord

use of org.apache.hive.hcatalog.data.HCatRecord in project hive by apache.

the class TestHCatPartitioned method columnOrderChangeTest.

// check behavior when the order of columns is changed
private void columnOrderChangeTest() throws Exception {
    HCatSchema tableSchema = getTableSchema();
    assertEquals(5, tableSchema.getFields().size());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c3", serdeConstants.STRING_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        objList.add("co str2value" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    Map<String, String> partitionMap = new HashMap<String, String>();
    partitionMap.put("part1", "p1value8");
    partitionMap.put("part0", "508");
    Exception exc = null;
    try {
        runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    } catch (IOException e) {
        exc = e;
    }
    assertTrue(exc != null);
    assertTrue(exc instanceof HCatException);
    assertEquals(ErrorType.ERROR_SCHEMA_COLUMN_MISMATCH, ((HCatException) exc).getErrorType());
    partitionColumns = new ArrayList<HCatFieldSchema>();
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c1", serdeConstants.INT_TYPE_NAME, "")));
    partitionColumns.add(HCatSchemaUtils.getHCatFieldSchema(new FieldSchema("c2", serdeConstants.STRING_TYPE_NAME, "")));
    writeRecords = new ArrayList<HCatRecord>();
    for (int i = 0; i < 10; i++) {
        List<Object> objList = new ArrayList<Object>();
        objList.add(i);
        objList.add("co strvalue" + i);
        writeRecords.add(new DefaultHCatRecord(objList));
    }
    runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
    if (isTableImmutable()) {
        // Read should get 10 + 20 + 10 + 10 + 20 rows
        runMRRead(70);
    } else {
        // +20 from the duplicate publish
        runMRRead(90);
    }
}
Also used : HashMap(java.util.HashMap) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) DefaultHCatRecord(org.apache.hive.hcatalog.data.DefaultHCatRecord) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
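
The assertions above rely on HCatException carrying a typed error code. A minimal sketch of the same check in application code, reusing the test's runMRCreate helper (hypothetical handling, not from the Hive sources):

// Hypothetical handling of a schema mismatch during a partition write.
try {
    runMRCreate(partitionMap, partitionColumns, writeRecords, 10, true);
} catch (HCatException e) {
    if (e.getErrorType() == ErrorType.ERROR_SCHEMA_COLUMN_MISMATCH) {
        // Record/partition columns do not line up with the table schema;
        // fix the column order or names before retrying.
        System.err.println("Schema mismatch against table: " + e.getMessage());
    } else {
        throw e;
    }
}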

Example 9 with HCatRecord

use of org.apache.hive.hcatalog.data.HCatRecord in project hive by apache.

the class TestE2EScenarios method copyTable.

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));
    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get().getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION, HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }
    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);
    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());
        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);
        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);
        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }
    oc.commitJob(ojob);
}
Also used : OutputCommitter(org.apache.hadoop.mapreduce.OutputCommitter) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) HashMap(java.util.HashMap) WritableComparable(org.apache.hadoop.io.WritableComparable) HCatOutputFormat(org.apache.hive.hcatalog.mapreduce.HCatOutputFormat) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatInputFormat(org.apache.hive.hcatalog.mapreduce.HCatInputFormat) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord)
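
copyTable drives the input and output formats by hand, including per-task commit. A minimal sketch of the same copy expressed as a regular map-only MapReduce job, where the framework handles splits, task contexts, and committers (database and table names are placeholders; standard imports are assumed):

// Hypothetical table-copy driver; not taken from the Hive sources.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "hcat-copy");

HCatInputFormat.setInput(job, "default", "source_table");
job.setInputFormatClass(HCatInputFormat.class);

HCatOutputFormat.setOutput(job,
        OutputJobInfo.create("default", "target_table", new HashMap<String, String>()));
HCatOutputFormat.setSchema(job, HCatInputFormat.getTableSchema(job.getConfiguration()));
job.setOutputFormatClass(HCatOutputFormat.class);

// Identity map: pass each HCatRecord straight through; no reduce phase needed.
job.setMapperClass(Mapper.class);
job.setNumReduceTasks(0);
job.setOutputKeyClass(WritableComparable.class);
job.setOutputValueClass(HCatRecord.class);

boolean succeeded = job.waitForCompletion(true);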

Example 10 with HCatRecord

use of org.apache.hive.hcatalog.data.HCatRecord in project hive by apache.

the class DataReaderSlave method main.

public static void main(String[] args) throws IOException, ClassNotFoundException {
    ObjectInputStream ois = new ObjectInputStream(new FileInputStream(new File(args[0])));
    ReaderContext cntxt = (ReaderContext) ois.readObject();
    ois.close();
    String[] inputSplitsToRead = args[1].split(",");
    List<InputSplit> splits = cntxt.getSplits();
    for (int i = 0; i < inputSplitsToRead.length; i++) {
        InputSplit split = splits.get(Integer.parseInt(inputSplitsToRead[i]));
        HCatReader reader = DataTransferFactory.getHCatReader(split, cntxt.getConf());
        Iterator<HCatRecord> itr = reader.read();
        File f = new File(args[2] + "-" + i);
        f.delete();
        BufferedWriter outFile = new BufferedWriter(new FileWriter(f));
        while (itr.hasNext()) {
            String rec = itr.next().toString().replaceFirst("\\s+$", "");
            System.err.println(rec);
            outFile.write(rec + "\n");
        }
        outFile.close();
    }
}
Also used : FileWriter(java.io.FileWriter) FileInputStream(java.io.FileInputStream) HCatReader(org.apache.hive.hcatalog.data.transfer.HCatReader) BufferedWriter(java.io.BufferedWriter) ReaderContext(org.apache.hive.hcatalog.data.transfer.ReaderContext) File(java.io.File) InputSplit(org.apache.hadoop.mapreduce.InputSplit) HCatRecord(org.apache.hive.hcatalog.data.HCatRecord) ObjectInputStream(java.io.ObjectInputStream)
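
DataReaderSlave only consumes a ReaderContext that some master process has already prepared and serialized to the file in args[0]. A minimal sketch of that master side using the same data transfer API (table name and output file are placeholders):

// Hypothetical master: build a ReaderContext and serialize it for the slave above.
Map<String, String> config = new HashMap<String, String>();
ReadEntity entity = new ReadEntity.Builder()
        .withDatabase("default")
        .withTable("mytable")
        .build();

HCatReader masterReader = DataTransferFactory.getHCatReader(entity, config);
ReaderContext cntxt = masterReader.prepareRead();

// Hand the context to the slave process via a plain serialized file.
ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(new File("reader.context")));
oos.writeObject(cntxt);
oos.close();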

Aggregations

HCatRecord (org.apache.hive.hcatalog.data.HCatRecord) 14
IOException (java.io.IOException) 6
DefaultHCatRecord (org.apache.hive.hcatalog.data.DefaultHCatRecord) 6
ArrayList (java.util.ArrayList) 4
WritableComparable (org.apache.hadoop.io.WritableComparable) 4
HCatException (org.apache.hive.hcatalog.common.HCatException) 4
HashMap (java.util.HashMap) 3
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext) 3
HCatSchema (org.apache.hive.hcatalog.data.schema.HCatSchema) 3
ReaderContext (org.apache.hive.hcatalog.data.transfer.ReaderContext) 3
Path (org.apache.hadoop.fs.Path) 2
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 2
InputSplit (org.apache.hadoop.mapreduce.InputSplit) 2
Job (org.apache.hadoop.mapreduce.Job) 2
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter) 2
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID) 2
HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema) 2
HCatReader (org.apache.hive.hcatalog.data.transfer.HCatReader) 2
HCatInputFormat (org.apache.hive.hcatalog.mapreduce.HCatInputFormat) 2
HCatOutputFormat (org.apache.hive.hcatalog.mapreduce.HCatOutputFormat) 2