Example 6 with Serializer

Use of org.apache.hadoop.hive.serde2.Serializer in project hive by apache.

The class FileSinkOperator, method initializeOp.

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    try {
        this.hconf = hconf;
        filesCreated = false;
        isNativeTable = !conf.getTableInfo().isNonNative();
        isTemporary = conf.isTemporary();
        multiFileSpray = conf.isMultiFileSpray();
        totalFiles = conf.getTotalFiles();
        numFiles = conf.getNumFiles();
        dpCtx = conf.getDynPartCtx();
        lbCtx = conf.getLbCtx();
        fsp = prevFsp = null;
        valToPaths = new HashMap<String, FSPaths>();
        taskId = Utilities.getTaskId(hconf);
        initializeSpecPath();
        fs = specPath.getFileSystem(hconf);
        try {
            createHiveOutputFormat(hconf);
        } catch (HiveException ex) {
            logOutputFormatError(hconf, ex);
            throw ex;
        }
        isCompressed = conf.getCompressed();
        parent = Utilities.toTempPath(conf.getDirName());
        statsFromRecordWriter = new boolean[numFiles];
        serializer = (Serializer) conf.getTableInfo().getDeserializerClass().newInstance();
        serializer.initialize(unsetNestedColumnPaths(hconf), conf.getTableInfo().getProperties());
        outputClass = serializer.getSerializedClass();
        if (isLogInfoEnabled) {
            LOG.info("Using serializer : " + serializer + " and formatter : " + hiveOutputFormat + (isCompressed ? " with compression" : ""));
        }
        // Timeout is chosen to make sure that even if one iteration takes more than
        // half of the script.timeout but less than script.timeout, we will still
        // be able to report progress.
        timeOut = hconf.getInt("mapred.healthChecker.script.timeout", 600000) / 2;
        if (hconf instanceof JobConf) {
            jc = (JobConf) hconf;
        } else {
            // test code path
            jc = new JobConf(hconf);
        }
        if (multiFileSpray) {
            partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
            int i = 0;
            for (ExprNodeDesc e : conf.getPartitionCols()) {
                partitionEval[i++] = ExprNodeEvaluatorFactory.get(e);
            }
            partitionObjectInspectors = initEvaluators(partitionEval, outputObjInspector);
            prtner = (HivePartitioner<HiveKey, Object>) ReflectionUtils.newInstance(jc.getPartitionerClass(), null);
        }
        if (dpCtx != null) {
            dpSetup();
        }
        if (lbCtx != null) {
            lbSetup();
        }
        if (!bDynParts) {
            fsp = new FSPaths(specPath);
            // createBucketFiles(fsp);
            if (!this.isSkewedStoredAsSubDirectories) {
                // special entry for non-DP case
                valToPaths.put("", fsp);
            }
        }
        final StoragePolicyValue tmpStorage = StoragePolicyValue.lookup(HiveConf.getVar(hconf, HIVE_TEMPORARY_TABLE_STORAGE));
        if (isTemporary && fsp != null && tmpStorage != StoragePolicyValue.DEFAULT) {
            final Path outputPath = fsp.taskOutputTempPath;
            StoragePolicyShim shim = ShimLoader.getHadoopShims().getStoragePolicyShim(fs);
            if (shim != null) {
                // directory creation is otherwise within the writers
                fs.mkdirs(outputPath);
                shim.setStoragePolicy(outputPath, tmpStorage);
            }
        }
        if (conf.getWriteType() == AcidUtils.Operation.UPDATE || conf.getWriteType() == AcidUtils.Operation.DELETE) {
            // ROW__ID is always in the first field
            recIdField = ((StructObjectInspector) outputObjInspector).getAllStructFieldRefs().get(0);
            recIdInspector = (StructObjectInspector) recIdField.getFieldObjectInspector();
            // bucket is the second field in the record id
            bucketField = recIdInspector.getAllStructFieldRefs().get(1);
            bucketInspector = (IntObjectInspector) bucketField.getFieldObjectInspector();
        }
        numRows = 0;
        cntr = 1;
        logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
        statsMap.put(getCounterName(Counter.RECORDS_OUT), row_count);
    } catch (HiveException e) {
        throw e;
    } catch (Exception e) {
        e.printStackTrace();
        throw new HiveException(e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) StoragePolicyValue(org.apache.hadoop.hive.shims.HadoopShims.StoragePolicyValue) HiveFatalException(org.apache.hadoop.hive.ql.metadata.HiveFatalException) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveKey(org.apache.hadoop.hive.ql.io.HiveKey) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) JobConf(org.apache.hadoop.mapred.JobConf) StoragePolicyShim(org.apache.hadoop.hive.shims.HadoopShims.StoragePolicyShim) SubStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SubStructObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
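
The Serializer handling in this snippet reduces to three calls: instantiate the SerDe class named by the TableDesc, initialize it with the table properties, and ask which Writable class it will produce; the same pattern recurs in the examples below. A minimal sketch of that pattern, using only classes already shown above; the wrapper class and method names (SerializerSetup, setupSerializer) are illustrative, not part of Hive.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.Serializer;
import org.apache.hadoop.io.Writable;

// Hypothetical helper mirroring the serializer setup in initializeOp above.
public final class SerializerSetup {
    public static Class<? extends Writable> setupSerializer(Configuration conf, TableDesc tableInfo) throws HiveException {
        try {
            // The table's SerDe class is instantiated and used as a Serializer.
            Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
            // Table properties (columns, types, delimiters) drive initialization.
            serializer.initialize(conf, tableInfo.getProperties());
            // This class is what the record writers will later be asked to accept.
            return serializer.getSerializedClass();
        } catch (Exception e) {
            throw new HiveException(e);
        }
    }
}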

Example 7 with Serializer

Use of org.apache.hadoop.hive.serde2.Serializer in project hive by apache.

The class Utilities, method createEmptyBuckets.

/**
   * Check the existence of buckets according to bucket specification. Create empty buckets if
   * needed.
   *
   * @param hconf The Hadoop configuration used to build the JobConf and resolve the output format
   * @param paths A list of empty buckets to create
   * @param conf The definition of the FileSink.
   * @param reporter The mapreduce reporter object
   * @throws HiveException
   * @throws IOException
   */
private static void createEmptyBuckets(Configuration hconf, List<Path> paths, FileSinkDesc conf, Reporter reporter) throws HiveException, IOException {
    JobConf jc;
    if (hconf instanceof JobConf) {
        // both branches copy the incoming configuration into a fresh JobConf
        jc = new JobConf(hconf);
    } else {
        // test code path
        jc = new JobConf(hconf);
    }
    HiveOutputFormat<?, ?> hiveOutputFormat = null;
    Class<? extends Writable> outputClass = null;
    boolean isCompressed = conf.getCompressed();
    TableDesc tableInfo = conf.getTableInfo();
    try {
        Serializer serializer = (Serializer) tableInfo.getDeserializerClass().newInstance();
        serializer.initialize(null, tableInfo.getProperties());
        outputClass = serializer.getSerializedClass();
        hiveOutputFormat = HiveFileFormatUtils.getHiveOutputFormat(hconf, conf.getTableInfo());
    } catch (SerDeException e) {
        throw new HiveException(e);
    } catch (InstantiationException e) {
        throw new HiveException(e);
    } catch (IllegalAccessException e) {
        throw new HiveException(e);
    }
    for (Path path : paths) {
        RecordWriter writer = HiveFileFormatUtils.getRecordWriter(jc, hiveOutputFormat, outputClass, isCompressed, tableInfo.getProperties(), path, reporter);
        writer.close(false);
        LOG.info("created empty bucket for enforcing bucketing at " + path);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) RecordWriter(org.apache.hadoop.hive.ql.exec.FileSinkOperator.RecordWriter) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) Serializer(org.apache.hadoop.hive.serde2.Serializer)

Example 8 with Serializer

Use of org.apache.hadoop.hive.serde2.Serializer in project hive by apache.

The class ReduceSinkOperator, method initializeOp.

@Override
protected void initializeOp(Configuration hconf) throws HiveException {
    super.initializeOp(hconf);
    try {
        numRows = 0;
        cntr = 1;
        logEveryNRows = HiveConf.getLongVar(hconf, HiveConf.ConfVars.HIVE_LOG_N_RECORDS);
        statsMap.put(getCounterName(Counter.RECORDS_OUT_INTERMEDIATE, hconf), recordCounter);
        List<ExprNodeDesc> keys = conf.getKeyCols();
        if (isLogDebugEnabled) {
            LOG.debug("keys size is " + keys.size());
            for (ExprNodeDesc k : keys) {
                LOG.debug("Key exprNodeDesc " + k.getExprString());
            }
        }
        keyEval = new ExprNodeEvaluator[keys.size()];
        int i = 0;
        for (ExprNodeDesc e : keys) {
            if (e instanceof ExprNodeConstantDesc && (BUCKET_NUMBER_COL_NAME).equals(((ExprNodeConstantDesc) e).getValue())) {
                buckColIdxInKeyForSdpo = i;
            }
            keyEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        numDistributionKeys = conf.getNumDistributionKeys();
        distinctColIndices = conf.getDistinctColumnIndices();
        numDistinctExprs = distinctColIndices.size();
        valueEval = new ExprNodeEvaluator[conf.getValueCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getValueCols()) {
            valueEval[i++] = ExprNodeEvaluatorFactory.get(e);
        }
        partitionEval = new ExprNodeEvaluator[conf.getPartitionCols().size()];
        i = 0;
        for (ExprNodeDesc e : conf.getPartitionCols()) {
            int index = ExprNodeDescUtils.indexOf(e, keys);
            partitionEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
        }
        if (conf.getBucketCols() != null && !conf.getBucketCols().isEmpty()) {
            bucketEval = new ExprNodeEvaluator[conf.getBucketCols().size()];
            i = 0;
            for (ExprNodeDesc e : conf.getBucketCols()) {
                int index = ExprNodeDescUtils.indexOf(e, keys);
                bucketEval[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
            }
            buckColIdxInKey = conf.getPartitionCols().size();
        }
        tag = conf.getTag();
        tagByte[0] = (byte) tag;
        skipTag = conf.getSkipTag();
        if (isLogInfoEnabled) {
            LOG.info("Using tag = " + tag);
        }
        TableDesc keyTableDesc = conf.getKeySerializeInfo();
        keySerializer = (Serializer) keyTableDesc.getDeserializerClass().newInstance();
        keySerializer.initialize(null, keyTableDesc.getProperties());
        keyIsText = keySerializer.getSerializedClass().equals(Text.class);
        TableDesc valueTableDesc = conf.getValueSerializeInfo();
        valueSerializer = (Serializer) valueTableDesc.getDeserializerClass().newInstance();
        valueSerializer.initialize(null, valueTableDesc.getProperties());
        int limit = conf.getTopN();
        float memUsage = conf.getTopNMemoryUsage();
        if (limit >= 0 && memUsage > 0) {
            reducerHash = conf.isPTFReduceSink() ? new PTFTopNHash() : new TopNHash();
            reducerHash.initialize(limit, memUsage, conf.isMapGroupBy(), this, conf, hconf);
        }
        useUniformHash = conf.getReducerTraits().contains(UNIFORM);
        firstRow = true;
    } catch (Exception e) {
        String msg = "Error initializing ReduceSinkOperator: " + e.getMessage();
        LOG.error(msg, e);
        throw new RuntimeException(e);
    }
}
Also used : ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) Text(org.apache.hadoop.io.Text) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
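
The evaluator-array construction above is repeated four times (keys, values, partition columns, bucket columns), with partition and bucket columns reusing a key evaluator whenever the expression already appears in the key list. A hypothetical helper capturing that repeated loop; it is a sketch, not an API that exists in Hive.

import java.util.List;

import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;

// Hypothetical helper: one evaluator per expression, reusing key evaluators
// when the expression is already evaluated as part of the key.
public final class EvaluatorArrays {
    public static ExprNodeEvaluator[] buildEvaluators(List<ExprNodeDesc> exprs, List<ExprNodeDesc> keys, ExprNodeEvaluator[] keyEval) throws HiveException {
        ExprNodeEvaluator[] result = new ExprNodeEvaluator[exprs.size()];
        int i = 0;
        for (ExprNodeDesc e : exprs) {
            // For the key columns themselves, pass keys = null to always build new evaluators.
            int index = keys == null ? -1 : ExprNodeDescUtils.indexOf(e, keys);
            result[i++] = index < 0 ? ExprNodeEvaluatorFactory.get(e) : keyEval[index];
        }
        return result;
    }
}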

Example 9 with Serializer

Use of org.apache.hadoop.hive.serde2.Serializer in project hive by apache.

The class SkewJoinHandler, method initiliaze.

public void initiliaze(Configuration hconf) {
    this.hconf = hconf;
    JoinDesc desc = joinOp.getConf();
    skewKeyDefinition = desc.getSkewKeyDefinition();
    skewKeysTableObjectInspector = new HashMap<Byte, StructObjectInspector>(numAliases);
    tblDesc = desc.getSkewKeysValuesTables();
    tblSerializers = new HashMap<Byte, AbstractSerDe>(numAliases);
    bigKeysExistingMap = new HashMap<Byte, Boolean>(numAliases);
    taskId = Utilities.getTaskId(hconf);
    int[][] filterMap = desc.getFilterMap();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = conf.getTagOrder()[i];
        List<ObjectInspector> skewTableKeyInspectors = new ArrayList<ObjectInspector>();
        StructObjectInspector soi = (StructObjectInspector) joinOp.inputObjInspectors[alias];
        StructField sf = soi.getStructFieldRef(Utilities.ReduceField.KEY.toString());
        List<? extends StructField> keyFields = ((StructObjectInspector) sf.getFieldObjectInspector()).getAllStructFieldRefs();
        int keyFieldSize = keyFields.size();
        for (int k = 0; k < keyFieldSize; k++) {
            skewTableKeyInspectors.add(keyFields.get(k).getFieldObjectInspector());
        }
        TableDesc joinKeyDesc = desc.getKeyTableDesc();
        List<String> keyColNames = Utilities.getColumnNames(joinKeyDesc.getProperties());
        StructObjectInspector structTblKeyInpector = ObjectInspectorFactory.getStandardStructObjectInspector(keyColNames, skewTableKeyInspectors);
        try {
            AbstractSerDe serializer = (AbstractSerDe) ReflectionUtils.newInstance(tblDesc.get(alias).getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(serializer, null, tblDesc.get(alias).getProperties(), null);
            tblSerializers.put((byte) i, serializer);
        } catch (SerDeException e) {
            LOG.error("Skewjoin will be disabled due to " + e.getMessage(), e);
            joinOp.handleSkewJoin = false;
            break;
        }
        boolean hasFilter = filterMap != null && filterMap[i] != null;
        TableDesc valTblDesc = JoinUtil.getSpillTableDesc(alias, joinOp.spillTableDesc, conf, !hasFilter);
        List<String> valColNames = new ArrayList<String>();
        if (valTblDesc != null) {
            valColNames = Utilities.getColumnNames(valTblDesc.getProperties());
        }
        StructObjectInspector structTblValInpector = ObjectInspectorFactory.getStandardStructObjectInspector(valColNames, joinOp.joinValuesStandardObjectInspectors[i]);
        StructObjectInspector structTblInpector = ObjectInspectorFactory.getUnionStructObjectInspector(Arrays.asList(structTblValInpector, structTblKeyInpector));
        skewKeysTableObjectInspector.put((byte) i, structTblInpector);
    }
    // reset rowcontainer's serde, objectinspector, and tableDesc.
    for (int i = 0; i < numAliases; i++) {
        Byte alias = conf.getTagOrder()[i];
        RowContainer<ArrayList<Object>> rc = (RowContainer) joinOp.storage[i];
        if (rc != null) {
            rc.setSerDe(tblSerializers.get((byte) i), skewKeysTableObjectInspector.get((byte) i));
            rc.setTableDesc(tblDesc.get(alias));
        }
    }
}
Also used : RowContainer(org.apache.hadoop.hive.ql.exec.persistence.RowContainer) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) ArrayList(java.util.ArrayList) AbstractSerDe(org.apache.hadoop.hive.serde2.AbstractSerDe) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) SerDeException(org.apache.hadoop.hive.serde2.SerDeException)
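
Each join alias above gets its own SerDe, created reflectively from the spill table's TableDesc and initialized through SerDeUtils with the table properties. A minimal sketch of that per-alias step; the wrapper class and the helper name createSerDeFor are hypothetical.

import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.SerDeUtils;
import org.apache.hadoop.util.ReflectionUtils;

// Hypothetical helper mirroring the per-alias SerDe setup in initiliaze above.
public final class SkewSerDeSetup {
    public static AbstractSerDe createSerDeFor(TableDesc tblDesc) throws SerDeException {
        // Instantiate the SerDe class named by the table descriptor.
        AbstractSerDe serde = (AbstractSerDe) ReflectionUtils.newInstance(tblDesc.getDeserializerClass(), null);
        // Initialize with table properties only; no Configuration or partition properties.
        SerDeUtils.initializeSerDe(serde, null, tblDesc.getProperties(), null);
        return serde;
    }
}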

Example 10 with Serializer

Use of org.apache.hadoop.hive.serde2.Serializer in project hive by apache.

The class TestHiveAccumuloTableOutputFormat, method testWriteToMockInstance.

@Test
public void testWriteToMockInstance() throws Exception {
    Instance inst = new MockInstance(test.getMethodName());
    Connector conn = inst.getConnector("root", new PasswordToken(""));
    HiveAccumuloTableOutputFormat outputFormat = new HiveAccumuloTableOutputFormat();
    String table = test.getMethodName();
    conn.tableOperations().create(table);
    JobConf conf = new JobConf();
    conf.set(AccumuloConnectionParameters.INSTANCE_NAME, inst.getInstanceName());
    conf.set(AccumuloConnectionParameters.USER_NAME, "root");
    conf.set(AccumuloConnectionParameters.USER_PASS, "");
    conf.setBoolean(AccumuloConnectionParameters.USE_MOCK_INSTANCE, true);
    conf.set(AccumuloConnectionParameters.TABLE_NAME, test.getMethodName());
    FileSystem local = FileSystem.getLocal(conf);
    outputFormat.checkOutputSpecs(local, conf);
    RecordWriter<Text, Mutation> recordWriter = outputFormat.getRecordWriter(local, conf, null, null);
    List<String> names = Arrays.asList("row", "col1", "col2");
    List<TypeInfo> types = Arrays.<TypeInfo>asList(TypeInfoFactory.stringTypeInfo, TypeInfoFactory.stringTypeInfo, TypeInfoFactory.stringTypeInfo);
    Properties tableProperties = new Properties();
    tableProperties.setProperty(AccumuloSerDeParameters.COLUMN_MAPPINGS, ":rowID,cf:cq1,cf:cq2");
    tableProperties.setProperty(serdeConstants.FIELD_DELIM, " ");
    tableProperties.setProperty(serdeConstants.LIST_COLUMNS, Joiner.on(',').join(names));
    tableProperties.setProperty(serdeConstants.LIST_COLUMN_TYPES, Joiner.on(',').join(types));
    AccumuloSerDeParameters accumuloSerDeParams = new AccumuloSerDeParameters(new Configuration(), tableProperties, AccumuloSerDe.class.getSimpleName());
    LazySerDeParameters serDeParams = accumuloSerDeParams.getSerDeParameters();
    AccumuloRowSerializer serializer = new AccumuloRowSerializer(0, serDeParams, accumuloSerDeParams.getColumnMappings(), AccumuloSerDeParameters.DEFAULT_VISIBILITY_LABEL, accumuloSerDeParams.getRowIdFactory());
    TypeInfo stringTypeInfo = TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.STRING_TYPE_NAME);
    LazySimpleStructObjectInspector structOI = (LazySimpleStructObjectInspector) LazyFactory.createLazyStructInspector(Arrays.asList("row", "cq1", "cq2"), Arrays.asList(stringTypeInfo, stringTypeInfo, stringTypeInfo), serDeParams.getSeparators(), serDeParams.getNullSequence(), serDeParams.isLastColumnTakesRest(), serDeParams.isEscaped(), serDeParams.getEscapeChar());
    LazyStruct struct = (LazyStruct) LazyFactory.createLazyObject(structOI);
    ByteArrayRef bytes = new ByteArrayRef();
    bytes.setData("row value1 value2".getBytes());
    struct.init(bytes, 0, bytes.getData().length);
    // Serialize the struct into a mutation
    Mutation m = serializer.serialize(struct, structOI);
    // Write the mutation
    recordWriter.write(new Text(table), m);
    // Close the writer
    recordWriter.close(null);
    Iterator<Entry<Key, Value>> iter = conn.createScanner(table, new Authorizations()).iterator();
    Assert.assertTrue("Iterator did not have an element as expected", iter.hasNext());
    Entry<Key, Value> entry = iter.next();
    Key k = entry.getKey();
    Value v = entry.getValue();
    Assert.assertEquals("row", k.getRow().toString());
    Assert.assertEquals("cf", k.getColumnFamily().toString());
    Assert.assertEquals("cq1", k.getColumnQualifier().toString());
    Assert.assertEquals("", k.getColumnVisibility().toString());
    Assert.assertEquals("value1", new String(v.get()));
    Assert.assertTrue("Iterator did not have an element as expected", iter.hasNext());
    entry = iter.next();
    k = entry.getKey();
    v = entry.getValue();
    Assert.assertEquals("row", k.getRow().toString());
    Assert.assertEquals("cf", k.getColumnFamily().toString());
    Assert.assertEquals("cq2", k.getColumnQualifier().toString());
    Assert.assertEquals("", k.getColumnVisibility().toString());
    Assert.assertEquals("value2", new String(v.get()));
    Assert.assertFalse("Iterator unexpectedly had more data", iter.hasNext());
}
Also used : Connector(org.apache.accumulo.core.client.Connector) Configuration(org.apache.hadoop.conf.Configuration) MockInstance(org.apache.accumulo.core.client.mock.MockInstance) Instance(org.apache.accumulo.core.client.Instance) LazySerDeParameters(org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters) Properties(java.util.Properties) PasswordToken(org.apache.accumulo.core.client.security.tokens.PasswordToken) Entry(java.util.Map.Entry) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf) AccumuloRowSerializer(org.apache.hadoop.hive.accumulo.serde.AccumuloRowSerializer) LazyStruct(org.apache.hadoop.hive.serde2.lazy.LazyStruct) Authorizations(org.apache.accumulo.core.security.Authorizations) LazySimpleStructObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector) Text(org.apache.hadoop.io.Text) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) AccumuloSerDe(org.apache.hadoop.hive.accumulo.serde.AccumuloSerDe) AccumuloSerDeParameters(org.apache.hadoop.hive.accumulo.serde.AccumuloSerDeParameters) ByteArrayRef(org.apache.hadoop.hive.serde2.lazy.ByteArrayRef) Value(org.apache.accumulo.core.data.Value) Mutation(org.apache.accumulo.core.data.Mutation) Key(org.apache.accumulo.core.data.Key) Test(org.junit.Test)
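
The assertions at the end check that, with the column mapping ":rowID,cf:cq1,cf:cq2" and the space-delimited row "row value1 value2", the serializer produces one Mutation whose two entries land under cf:cq1 and cf:cq2 with empty visibility. A hedged sketch of that expected Mutation, built directly with the Accumulo client API instead of going through AccumuloRowSerializer; the class and method names are illustrative.

import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.hadoop.io.Text;

// Illustrative only: the Mutation the test expects the serializer to emit.
public final class ExpectedMutationSketch {
    public static Mutation expected() {
        // ":rowID" maps the first column to the Accumulo row id.
        Mutation m = new Mutation(new Text("row"));
        // "cf:cq1" and "cf:cq2" map the remaining columns; visibility stays empty.
        m.put(new Text("cf"), new Text("cq1"), new Value("value1".getBytes()));
        m.put(new Text("cf"), new Text("cq2"), new Value("value2".getBytes()));
        return m;
    }
}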

Aggregations

TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 11 usages
LazySerDeParameters (org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters): 10 usages
LazySimpleStructObjectInspector (org.apache.hadoop.hive.serde2.lazy.objectinspector.LazySimpleStructObjectInspector): 10 usages
Properties (java.util.Properties): 9 usages
Test (org.junit.Test): 9 usages
Mutation (org.apache.accumulo.core.data.Mutation): 8 usages
Configuration (org.apache.hadoop.conf.Configuration): 8 usages
ByteArrayRef (org.apache.hadoop.hive.serde2.lazy.ByteArrayRef): 8 usages
LazyStruct (org.apache.hadoop.hive.serde2.lazy.LazyStruct): 8 usages
JobConf (org.apache.hadoop.mapred.JobConf): 8 usages
SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 7 usages
Serializer (org.apache.hadoop.hive.serde2.Serializer): 7 usages
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 6 usages
ArrayList (java.util.ArrayList): 5 usages
ColumnVisibility (org.apache.accumulo.core.security.ColumnVisibility): 5 usages
StructField (org.apache.hadoop.hive.serde2.objectinspector.StructField): 5 usages
Text (org.apache.hadoop.io.Text): 5 usages
Entry (java.util.Map.Entry): 4 usages
Connector (org.apache.accumulo.core.client.Connector): 4 usages
Instance (org.apache.accumulo.core.client.Instance): 4 usages