Example 36 with TypeInfo

Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project nifi by apache.

The class ConvertAvroToORC, method onTrigger.

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    try {
        long startTime = System.currentTimeMillis();
        final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
        final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
        final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
        final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
        final AtomicInteger totalRecordCount = new AtomicInteger(0);
        final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
        flowFile = session.write(flowFile, (rawIn, rawOut) -> {
            try (final InputStream in = new BufferedInputStream(rawIn);
                final OutputStream out = new BufferedOutputStream(rawOut);
                final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
                // Create ORC schema from Avro schema
                Schema avroSchema = reader.getSchema();
                TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
                if (orcConfig == null) {
                    orcConfig = new Configuration();
                }
                OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(out, new Path(fileName), orcConfig, orcSchema, stripeSize, compressionType, bufferSize);
                try {
                    int recordCount = 0;
                    GenericRecord currRecord = null;
                    while (reader.hasNext()) {
                        currRecord = reader.next(currRecord);
                        List<Schema.Field> fields = currRecord.getSchema().getFields();
                        if (fields != null) {
                            Object[] row = new Object[fields.size()];
                            for (int i = 0; i < fields.size(); i++) {
                                Schema.Field field = fields.get(i);
                                Schema fieldSchema = field.schema();
                                Object o = currRecord.get(field.name());
                                try {
                                    row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
                                } catch (ArrayIndexOutOfBoundsException aioobe) {
                                    getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}", new Object[] { recordCount, i, fieldSchema.getType().getName(), o.toString() }, aioobe);
                                    throw new IOException(aioobe);
                                }
                            }
                            orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
                            recordCount++;
                        }
                    }
                    hiveAvroSchema.set(avroSchema);
                    totalRecordCount.set(recordCount);
                } finally {
                    // finished writing the records; close the writer (which will flush to the flow file)
                    orcWriter.close();
                }
            }
        });
        final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet() ? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue() : NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
        String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
        // Add attributes and transfer to success
        flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
        flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
        StringBuilder newFilename = new StringBuilder();
        int extensionIndex = fileName.lastIndexOf(".");
        if (extensionIndex != -1) {
            newFilename.append(fileName.substring(0, extensionIndex));
        } else {
            newFilename.append(fileName);
        }
        newFilename.append(".orc");
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
        session.transfer(flowFile, REL_SUCCESS);
        session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used : StandardValidators(org.apache.nifi.processor.util.StandardValidators) BufferedInputStream(java.io.BufferedInputStream) CapabilityDescription(org.apache.nifi.annotation.documentation.CapabilityDescription) SideEffectFree(org.apache.nifi.annotation.behavior.SideEffectFree) AtomicReference(java.util.concurrent.atomic.AtomicReference) PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) ProcessException(org.apache.nifi.processor.exception.ProcessException) NiFiOrcUtils(org.apache.hadoop.hive.ql.io.orc.NiFiOrcUtils) BufferedOutputStream(java.io.BufferedOutputStream) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) WritesAttributes(org.apache.nifi.annotation.behavior.WritesAttributes) Relationship(org.apache.nifi.processor.Relationship) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) CompressionKind(org.apache.hadoop.hive.ql.io.orc.CompressionKind) HiveJdbcCommon(org.apache.nifi.util.hive.HiveJdbcCommon) OutputStream(java.io.OutputStream) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessContext(org.apache.nifi.processor.ProcessContext) DataFileStream(org.apache.avro.file.DataFileStream) Set(java.util.Set) OrcFlowFileWriter(org.apache.hadoop.hive.ql.io.orc.OrcFlowFileWriter) ProcessSession(org.apache.nifi.processor.ProcessSession) IOException(java.io.IOException) WritesAttribute(org.apache.nifi.annotation.behavior.WritesAttribute) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) HiveUtils(org.apache.nifi.util.hive.HiveUtils) InputRequirement(org.apache.nifi.annotation.behavior.InputRequirement) OnScheduled(org.apache.nifi.annotation.lifecycle.OnScheduled) List(java.util.List) SupportsBatching(org.apache.nifi.annotation.behavior.SupportsBatching) AbstractProcessor(org.apache.nifi.processor.AbstractProcessor) Tags(org.apache.nifi.annotation.documentation.Tags) DataUnit(org.apache.nifi.processor.DataUnit) CoreAttributes(org.apache.nifi.flowfile.attributes.CoreAttributes) Collections(java.util.Collections) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) InputStream(java.io.InputStream)
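
For reference, a minimal sketch of the Avro-to-Hive type mapping that NiFiOrcUtils.getOrcField performs above. This is a hypothetical, simplified illustration (flat record, primitive fields only); the real NiFi utility also handles unions, arrays, maps, nested records, and logical types.

import java.util.ArrayList;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class AvroToOrcTypeSketch {

    // Map a flat Avro record schema to a Hive StructTypeInfo.
    public static TypeInfo toStructTypeInfo(Schema recordSchema) {
        List<String> names = new ArrayList<>();
        List<TypeInfo> types = new ArrayList<>();
        for (Schema.Field field : recordSchema.getFields()) {
            names.add(field.name());
            types.add(toPrimitiveTypeInfo(field.schema()));
        }
        return TypeInfoFactory.getStructTypeInfo(names, types);
    }

    // Map the common Avro primitives to the corresponding Hive TypeInfo constants.
    private static TypeInfo toPrimitiveTypeInfo(Schema fieldSchema) {
        switch (fieldSchema.getType()) {
            case INT:     return TypeInfoFactory.intTypeInfo;
            case LONG:    return TypeInfoFactory.longTypeInfo;
            case FLOAT:   return TypeInfoFactory.floatTypeInfo;
            case DOUBLE:  return TypeInfoFactory.doubleTypeInfo;
            case BOOLEAN: return TypeInfoFactory.booleanTypeInfo;
            case STRING:  return TypeInfoFactory.stringTypeInfo;
            default:
                throw new IllegalArgumentException("Unhandled Avro type: " + fieldSchema.getType());
        }
    }
}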

Example 37 with TypeInfo

Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project SQLWindowing by hbutani.

The class MRUtils, method initialize.

/**
 * Construct the data structures containing ExprNodeDesc for partition
 * columns and order columns. Use the input definition to construct the list
 * of output columns for the ReduceSinkOperator.
 *
 * @throws WindowingException
 */
public void initialize() throws WindowingException {
    TableFuncDef tabDef = RuntimeUtils.getFirstTableFunction(qdef);
    hiveTableDef = tabDef.getHiveTableDef();
    InputInfo inputInfo;
    ArrayList<ColumnDef> partColList = tabDef.getWindow().getPartDef().getColumns();
    TableFunctionEvaluator tEval = tabDef.getFunction();
    /*
     * If the query has a map phase, the inputInfo is retrieved from the map
     * output info of the table function definition. This is constructed
     * using the map output oi of the table function definition. If the
     * query does not have a map phase, the inputInfo is retrieved from the
     * QueryInputDef (either HiveTableDef or HiveQueryDef) of the query.
     */
    if (tEval.isTransformsRawInput()) {
        inputInfo = qdef.getTranslationInfo().getMapInputInfo(tabDef);
    } else {
        inputInfo = qdef.getTranslationInfo().getInputInfo(hiveTableDef);
    }
    for (ColumnDef colDef : partColList) {
        partCols.add(colDef.getExprNode());
    }
    ArrayList<OrderColumnDef> orderColList = tabDef.getWindow().getOrderDef().getColumns();
    for (OrderColumnDef colDef : orderColList) {
        Order order = colDef.getOrder();
        if (order.name().equals("ASC")) {
            orderString.append('+');
        } else {
            orderString.append('-');
        }
        orderCols.add(colDef.getExprNode());
        outputColumnNames.add(colDef.getAlias());
    }
    RowResolver rr = inputInfo.getRowResolver();
    ArrayList<ColumnInfo> colInfoList = rr.getColumnInfos();
    for (ColumnInfo colInfo : colInfoList) {
        String internalName = colInfo.getInternalName();
        TypeInfo type = colInfo.getType();
        valueCols.add(TranslateUtils.getExprDesc(internalName, type));
        outputColumnNames.add(internalName);
    }
}
Also used : Order(com.sap.hadoop.metadata.Order) OrderColumnDef(com.sap.hadoop.windowing.query2.definition.OrderColumnDef) ColumnDef(com.sap.hadoop.windowing.query2.definition.ColumnDef) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) RowResolver(org.apache.hadoop.hive.ql.parse.RowResolver) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) TableFuncDef(com.sap.hadoop.windowing.query2.definition.TableFuncDef) InputInfo(com.sap.hadoop.windowing.query2.translate.QueryTranslationInfo.InputInfo) TableFunctionEvaluator(com.sap.hadoop.windowing.functions2.TableFunctionEvaluator)
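
A minimal sketch of what TranslateUtils.getExprDesc is assumed to do for each row-resolver column above: wrap the column's internal name and TypeInfo into an ExprNodeColumnDesc so it can be emitted as a value column of the ReduceSinkOperator. The helper below is hypothetical and uses only the public Hive ExprNodeColumnDesc constructor.

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class ValueColumnSketch {

    // Build a column expression from a row-resolver entry: same TypeInfo and
    // internal name, table alias carried over, not a partition/virtual column.
    public static ExprNodeDesc toColumnExpr(ColumnInfo colInfo) {
        TypeInfo type = colInfo.getType();
        return new ExprNodeColumnDesc(type, colInfo.getInternalName(), colInfo.getTabAlias(), false);
    }
}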

Example 38 with TypeInfo

Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache.

The class SemanticAnalyzer, method getColForInsertStmtSpec.

private RowResolver getColForInsertStmtSpec(Map<String, ExprNodeDesc> targetCol2Projection, final Table target, Map<String, ColumnInfo> targetCol2ColumnInfo, int colListPos, List<TypeInfo> targetTableColTypes, ArrayList<ExprNodeDesc> new_col_list, List<String> targetTableColNames) throws SemanticException {
    RowResolver newOutputRR = new RowResolver();
    Map<String, String> colNameToDefaultVal = null;
    // see if we need to fetch default constraints from metastore
    if (targetCol2Projection.size() < targetTableColNames.size()) {
        try {
            DefaultConstraint dc = Hive.get().getEnabledDefaultConstraints(target.getDbName(), target.getTableName());
            colNameToDefaultVal = dc.getColNameToDefaultValueMap();
        } catch (Exception e) {
            if (e instanceof SemanticException) {
                throw (SemanticException) e;
            } else {
                throw (new RuntimeException(e));
            }
        }
    }
    boolean defaultConstraintsFetch = true;
    for (int i = 0; i < targetTableColNames.size(); i++) {
        String f = targetTableColNames.get(i);
        if (targetCol2Projection.containsKey(f)) {
            // put existing column in new list to make sure it is in the right position
            new_col_list.add(targetCol2Projection.get(f));
            ColumnInfo ci = targetCol2ColumnInfo.get(f);
            ci.setInternalName(getColumnInternalName(colListPos));
            newOutputRR.put(ci.getTabAlias(), ci.getInternalName(), ci);
        } else {
            // add new 'synthetic' columns for projections not provided by Select
            assert (colNameToDefaultVal != null);
            ExprNodeDesc exp = null;
            if (colNameToDefaultVal.containsKey(f)) {
                // make an expression for default value
                String defaultValue = colNameToDefaultVal.get(f);
                ParseDriver parseDriver = new ParseDriver();
                try {
                    ASTNode defValAst = parseDriver.parseExpression(defaultValue);
                    exp = TypeCheckProcFactory.genExprNode(defValAst, new TypeCheckCtx(null)).get(defValAst);
                } catch (Exception e) {
                    throw new SemanticException("Error while parsing default value: " + defaultValue + ". Error message: " + e.getMessage());
                }
                LOG.debug("Added default value from metastore: " + exp);
            } else {
                exp = new ExprNodeConstantDesc(targetTableColTypes.get(i), null);
            }
            new_col_list.add(exp);
            // this column doesn't come from any table
            final String tableAlias = null;
            ColumnInfo colInfo = new ColumnInfo(getColumnInternalName(colListPos), exp.getWritableObjectInspector(), tableAlias, false);
            newOutputRR.put(colInfo.getTabAlias(), colInfo.getInternalName(), colInfo);
        }
        colListPos++;
    }
    return newOutputRR;
}
Also used : ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) IOException(java.io.IOException) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) PatternSyntaxException(java.util.regex.PatternSyntaxException) FileNotFoundException(java.io.FileNotFoundException) AccessControlException(java.security.AccessControlException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc)
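
The else-branch above, where a target column has neither a projection nor a DEFAULT constraint, reduces to building a NULL literal that carries the column's type. A standalone sketch of that one step (the helper name is hypothetical; TypeInfoFactory and ExprNodeConstantDesc are the public Hive APIs used in the method):

import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class NullConstantSketch {

    // A NULL literal carrying the target column's type, e.g. "int" or "decimal(10,2)".
    public static ExprNodeDesc nullOfType(String hiveTypeName) {
        TypeInfo type = TypeInfoFactory.getPrimitiveTypeInfo(hiveTypeName);
        return new ExprNodeConstantDesc(type, null);
    }
}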

Example 39 with TypeInfo

Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache.

The class GenMRSkewJoinProcessor, method processSkewJoin.

/**
 * Create tasks for processing skew joins. The idea is (HIVE-964) to use
 * separated jobs and map-joins to handle skew joins.
 * <p>
 * <ul>
 * <li>
 * Number of MR jobs to handle skew keys is the number of tables minus 1 (we
 * can stream the last table, so big keys in the last table will not be a
 * problem).
 * <li>
 * At runtime in Join, we output big keys in one table into one corresponding
 * directory, and all the same keys in the other tables into different dirs
 * (one for each table). The directories will look like:
 * <ul>
 * <li>
 * dir-T1-bigkeys (containing big keys in T1), dir-T2-keys (containing keys
 * that are big in T1), dir-T3-keys (containing keys that are big in T1), ...
 * <li>
 * dir-T1-keys (containing keys that are big in T2), dir-T2-bigkeys (containing
 * big keys in T2), dir-T3-keys (containing keys that are big in T2), ...
 * <li>
 * dir-T1-keys (containing keys that are big in T3), dir-T2-keys (containing
 * keys that are big in T3), dir-T3-bigkeys (containing big keys in T3), ...
 * </ul>
 * </ul>
 * For each table, we launch one map-join job, taking the directory containing
 * the big keys in this table and the corresponding dirs in the other tables as
 * input. (Actually, one job per row in the listing above.)
 *
 * <p>
 * For more discussions, please check
 * https://issues.apache.org/jira/browse/HIVE-964.
 */
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
    // now does not work with outer joins
    if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
        return;
    }
    List<Task<? extends Serializable>> children = currTask.getChildTasks();
    Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
    JoinDesc joinDescriptor = joinOp.getConf();
    Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
    int numAliases = joinValues.size();
    Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
    Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
    Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
    Byte[] tags = joinDescriptor.getTagOrder();
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
        Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
        smallKeysDirMap.put(alias, smallKeysMap);
        for (Byte src2 : tags) {
            if (!src2.equals(alias)) {
                smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
            }
        }
        skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
    }
    joinDescriptor.setHandleSkewJoin(true);
    joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
    joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
    joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
    HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
    List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
    List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
    Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
    Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
    Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
    Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
    // used to create mapJoinDesc; must be in order
    List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
    for (Byte tag : tags) {
        newJoinValueTblDesc.add(null);
    }
    for (int i = 0; i < numAliases; i++) {
        Byte alias = tags[i];
        List<ExprNodeDesc> valueCols = joinValues.get(alias);
        String colNames = "";
        String colTypes = "";
        int columnSize = valueCols.size();
        List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
        ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
        boolean first = true;
        for (int k = 0; k < columnSize; k++) {
            TypeInfo type = valueCols.get(k).getTypeInfo();
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
            columnInfos.add(columnInfo);
            newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + newColName;
            colTypes = colTypes + valueCols.get(k).getTypeString();
        }
        // we are putting join keys at last part of the spilled table
        for (int k = 0; k < joinKeys.size(); k++) {
            if (!first) {
                colNames = colNames + ",";
                colTypes = colTypes + ",";
            }
            first = false;
            colNames = colNames + joinKeys.get(k);
            colTypes = colTypes + joinKeyTypes.get(k);
            ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
            columnInfos.add(columnInfo);
            newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
        }
        newJoinValues.put(alias, newValueExpr);
        newJoinKeys.put(alias, newKeyExpr);
        tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
        rowSchemaList.put(alias, new RowSchema(columnInfos));
        // construct value table Desc
        String valueColNames = "";
        String valueColTypes = "";
        first = true;
        for (int k = 0; k < columnSize; k++) {
            // any name, it does not matter.
            String newColName = i + "_VALUE_" + k;
            if (!first) {
                valueColNames = valueColNames + ",";
                valueColTypes = valueColTypes + ",";
            }
            valueColNames = valueColNames + newColName;
            valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
            first = false;
        }
        newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
    }
    joinDescriptor.setSkewKeysValuesTables(tableDescList);
    joinDescriptor.setKeyTableDesc(keyTblDesc);
    for (int i = 0; i < numAliases - 1; i++) {
        Byte src = tags[i];
        MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
        // This code has only been added for testing
        boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
        newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
        MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
        Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
        for (int k = 0; k < tags.length; k++) {
            Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
            ((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
            parentOps[k] = ts;
        }
        Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
        ArrayList<String> aliases = new ArrayList<String>();
        String alias = src.toString().intern();
        aliases.add(alias);
        Path bigKeyDirPath = bigKeysDirMap.get(src);
        newPlan.addPathToAlias(bigKeyDirPath, aliases);
        newPlan.getAliasToWork().put(alias, tblScan_op);
        PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
        newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
        newPlan.getAliasToPartnInfo().put(alias, part);
        Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
        assert reducer instanceof JoinOperator;
        JoinOperator cloneJoinOp = (JoinOperator) reducer;
        String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
        MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
        mapJoinDescriptor.setTagOrder(tags);
        mapJoinDescriptor.setHandleSkewJoin(false);
        mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
        mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
        MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
        Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
        for (int j = 0; j < numAliases; j++) {
            if (j == i) {
                continue;
            }
            Byte small_alias = tags[j];
            Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
            localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
            Path tblDir = smallTblDirs.get(small_alias);
            localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
        }
        newPlan.setMapRedLocalWork(localPlan);
        // construct a map join and set it as the child operator of tblScan_op
        MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
        // change the children of the original join operator to point to the map
        // join operator
        List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
        for (Operator<? extends OperatorDesc> childOp : childOps) {
            childOp.replaceParent(cloneJoinOp, mapJoinOp);
        }
        mapJoinOp.setChildOperators(childOps);
        HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
        newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
        newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
        newPlan.setInputformat(HiveInputFormat.class.getName());
        MapredWork w = new MapredWork();
        w.setMapWork(newPlan);
        Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w);
        skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
        bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
        listWorks.add(skewJoinMapJoinTask.getWork());
        listTasks.add(skewJoinMapJoinTask);
    }
    if (children != null) {
        for (Task<? extends Serializable> tsk : listTasks) {
            for (Task<? extends Serializable> oldChild : children) {
                tsk.addDependentTask(oldChild);
            }
        }
        currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
        for (Task<? extends Serializable> oldChild : children) {
            oldChild.getParentTasks().remove(currTask);
        }
        listTasks.addAll(children);
    }
    ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    cndTsk.setResolver(new ConditionalResolverSkewJoin());
    cndTsk.setResolverCtx(context);
    currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
    currTask.addDependentTask(cndTsk);
    return;
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) List(java.util.List) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) ConditionalResolverSkewJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) Operator(org.apache.hadoop.hive.ql.exec.Operator) HiveInputFormat(org.apache.hadoop.hive.ql.io.HiveInputFormat) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) Path(org.apache.hadoop.fs.Path) ConditionalResolverSkewJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverSkewJoin.ConditionalResolverSkewJoinCtx) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc)
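
The per-column bookkeeping in the join-key loop above (resolving a type name with TypeInfoFactory.getPrimitiveTypeInfo, wrapping it in a ColumnInfo, and exposing it as an ExprNodeColumnDesc) follows this pattern. A minimal sketch with a hypothetical helper name:

import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class SkewJoinColumnSketch {

    // Resolve a Hive primitive type name (e.g. "string", "bigint") to a TypeInfo,
    // attach it to a ColumnInfo for the given alias, and expose it as a column expression.
    public static ExprNodeColumnDesc syntheticColumn(String colName, String typeName, String tableAlias) {
        TypeInfo type = TypeInfoFactory.getPrimitiveTypeInfo(typeName);
        ColumnInfo columnInfo = new ColumnInfo(colName, type, tableAlias, false);
        return new ExprNodeColumnDesc(columnInfo);
    }
}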

Example 40 with TypeInfo

Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache.

The class VectorPTFOperator, method allocateOverflowBatchColumnVector.

/*
   * Allocate overflow batch columns by hand.
   */
private void allocateOverflowBatchColumnVector(VectorizedRowBatch overflowBatch, int outputColumn, String typeName) throws HiveException {
    if (overflowBatch.cols[outputColumn] == null) {
        typeName = VectorizationContext.mapTypeNameSynonyms(typeName);
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName);
        overflowBatch.cols[outputColumn] = VectorizedBatchUtil.createColumnVector(typeInfo);
    }
}
Also used : PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo)
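
A short usage sketch of the parsing step in the method above: TypeInfoUtils.getTypeInfoFromTypeString accepts any Hive type name, including parameterized and complex types, and returns the matching TypeInfo, which VectorizedBatchUtil.createColumnVector can then turn into a typed ColumnVector. The class below is a hypothetical standalone example; the commented output is what the API is expected to print.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class TypeStringSketch {
    public static void main(String[] args) {
        TypeInfo decimalType = TypeInfoUtils.getTypeInfoFromTypeString("decimal(10,2)");
        TypeInfo structType = TypeInfoUtils.getTypeInfoFromTypeString("struct<a:int,b:string>");
        System.out.println(decimalType.getTypeName());   // decimal(10,2)
        System.out.println(structType.getCategory());    // STRUCT
    }
}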

Aggregations

TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 516
PrimitiveTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo): 287
ArrayList (java.util.ArrayList): 202
StructTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo): 193
DecimalTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo): 167
ListTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo): 151
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 148
MapTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo): 138
Test (org.junit.Test): 135
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 107
UnionTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo): 78
HashMap (java.util.HashMap): 74
PrimitiveCategory (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory): 71
CharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo): 69
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 67
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 63
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 61
VarcharTypeInfo (org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo): 59
List (java.util.List): 54
HiveConf (org.apache.hadoop.hive.conf.HiveConf): 53