Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project nifi by apache: the class ConvertAvroToORC, method onTrigger.
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    try {
        long startTime = System.currentTimeMillis();
        final long stripeSize = context.getProperty(STRIPE_SIZE).asDataSize(DataUnit.B).longValue();
        final int bufferSize = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B).intValue();
        final CompressionKind compressionType = CompressionKind.valueOf(context.getProperty(COMPRESSION_TYPE).getValue());
        final AtomicReference<Schema> hiveAvroSchema = new AtomicReference<>(null);
        final AtomicInteger totalRecordCount = new AtomicInteger(0);
        final String fileName = flowFile.getAttribute(CoreAttributes.FILENAME.key());
        flowFile = session.write(flowFile, (rawIn, rawOut) -> {
            try (final InputStream in = new BufferedInputStream(rawIn);
                 final OutputStream out = new BufferedOutputStream(rawOut);
                 final DataFileStream<GenericRecord> reader = new DataFileStream<>(in, new GenericDatumReader<>())) {
                // Create ORC schema from Avro schema
                Schema avroSchema = reader.getSchema();
                TypeInfo orcSchema = NiFiOrcUtils.getOrcField(avroSchema);
                if (orcConfig == null) {
                    orcConfig = new Configuration();
                }
                OrcFlowFileWriter orcWriter = NiFiOrcUtils.createWriter(out, new Path(fileName), orcConfig, orcSchema, stripeSize, compressionType, bufferSize);
                try {
                    int recordCount = 0;
                    GenericRecord currRecord = null;
                    while (reader.hasNext()) {
                        currRecord = reader.next(currRecord);
                        List<Schema.Field> fields = currRecord.getSchema().getFields();
                        if (fields != null) {
                            Object[] row = new Object[fields.size()];
                            for (int i = 0; i < fields.size(); i++) {
                                Schema.Field field = fields.get(i);
                                Schema fieldSchema = field.schema();
                                Object o = currRecord.get(field.name());
                                try {
                                    row[i] = NiFiOrcUtils.convertToORCObject(NiFiOrcUtils.getOrcField(fieldSchema), o);
                                } catch (ArrayIndexOutOfBoundsException aioobe) {
                                    getLogger().error("Index out of bounds at record {} for column {}, type {}, and object {}",
                                            new Object[] { recordCount, i, fieldSchema.getType().getName(), o.toString() }, aioobe);
                                    throw new IOException(aioobe);
                                }
                            }
                            orcWriter.addRow(NiFiOrcUtils.createOrcStruct(orcSchema, row));
                            recordCount++;
                        }
                    }
                    hiveAvroSchema.set(avroSchema);
                    totalRecordCount.set(recordCount);
                } finally {
                    // finished writing this record, close the writer (which will flush to the flow file)
                    orcWriter.close();
                }
            }
        });
        final String hiveTableName = context.getProperty(HIVE_TABLE_NAME).isSet()
                ? context.getProperty(HIVE_TABLE_NAME).evaluateAttributeExpressions(flowFile).getValue()
                : NiFiOrcUtils.normalizeHiveTableName(hiveAvroSchema.get().getFullName());
        String hiveDDL = NiFiOrcUtils.generateHiveDDL(hiveAvroSchema.get(), hiveTableName);
        // Add attributes and transfer to success
        flowFile = session.putAttribute(flowFile, RECORD_COUNT_ATTRIBUTE, Integer.toString(totalRecordCount.get()));
        flowFile = session.putAttribute(flowFile, HIVE_DDL_ATTRIBUTE, hiveDDL);
        StringBuilder newFilename = new StringBuilder();
        int extensionIndex = fileName.lastIndexOf(".");
        if (extensionIndex != -1) {
            newFilename.append(fileName.substring(0, extensionIndex));
        } else {
            newFilename.append(fileName);
        }
        newFilename.append(".orc");
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), ORC_MIME_TYPE);
        flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), newFilename.toString());
        session.transfer(flowFile, REL_SUCCESS);
        session.getProvenanceReporter().modifyContent(flowFile, "Converted " + totalRecordCount.get() + " records", System.currentTimeMillis() - startTime);
    } catch (final ProcessException pe) {
        getLogger().error("Failed to convert {} from Avro to ORC due to {}; transferring to failure", new Object[] { flowFile, pe });
        session.transfer(flowFile, REL_FAILURE);
    }
}
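For context, a minimal sketch of the Avro-to-Hive type mapping that a call like NiFiOrcUtils.getOrcField performs for primitive fields. This is an illustrative approximation only, not the actual NiFi implementation; the real utility also handles unions, logical types, arrays, maps, and nested records. The class name AvroToTypeInfoSketch is hypothetical.

import org.apache.avro.Schema;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class AvroToTypeInfoSketch {
    // Map a primitive Avro schema to the corresponding Hive TypeInfo.
    public static TypeInfo toTypeInfo(Schema avroSchema) {
        switch (avroSchema.getType()) {
            case INT:     return TypeInfoFactory.intTypeInfo;
            case LONG:    return TypeInfoFactory.longTypeInfo;
            case FLOAT:   return TypeInfoFactory.floatTypeInfo;
            case DOUBLE:  return TypeInfoFactory.doubleTypeInfo;
            case BOOLEAN: return TypeInfoFactory.booleanTypeInfo;
            case STRING:  return TypeInfoFactory.stringTypeInfo;
            case BYTES:   return TypeInfoFactory.binaryTypeInfo;
            default:
                // Complex and union types need the fuller handling found in NiFiOrcUtils.
                throw new IllegalArgumentException("Unhandled Avro type: " + avroSchema.getType());
        }
    }
}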
Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project SQLWindowing by hbutani: the class MRUtils, method initialize.
/**
* Construct the data structures containing ExprNodeDesc for partition
* columns and order columns. Use the input definition to construct the list
* of output columns for the ReduceSinkOperator
*
* @throws WindowingException
*/
public void initialize() throws WindowingException {
    TableFuncDef tabDef = RuntimeUtils.getFirstTableFunction(qdef);
    hiveTableDef = tabDef.getHiveTableDef();
    InputInfo inputInfo;
    ArrayList<ColumnDef> partColList = tabDef.getWindow().getPartDef().getColumns();
    TableFunctionEvaluator tEval = tabDef.getFunction();
    /*
     * If the query has a map phase, the inputInfo is retrieved from the map
     * output info of the table function definition. This is constructed
     * using the map output oi of the table function definition. If the
     * query does not have a map phase, the inputInfo is retrieved from the
     * QueryInputDef (either HiveTableDef or HiveQueryDef) of the query.
     */
    if (tEval.isTransformsRawInput()) {
        inputInfo = qdef.getTranslationInfo().getMapInputInfo(tabDef);
    } else {
        inputInfo = qdef.getTranslationInfo().getInputInfo(hiveTableDef);
    }
    for (ColumnDef colDef : partColList) {
        partCols.add(colDef.getExprNode());
    }
    ArrayList<OrderColumnDef> orderColList = tabDef.getWindow().getOrderDef().getColumns();
    for (OrderColumnDef colDef : orderColList) {
        Order order = colDef.getOrder();
        if (order.name().equals("ASC")) {
            orderString.append('+');
        } else {
            orderString.append('-');
        }
        orderCols.add(colDef.getExprNode());
        outputColumnNames.add(colDef.getAlias());
    }
    RowResolver rr = inputInfo.getRowResolver();
    ArrayList<ColumnInfo> colInfoList = rr.getColumnInfos();
    for (ColumnInfo colInfo : colInfoList) {
        String internalName = colInfo.getInternalName();
        TypeInfo type = colInfo.getType();
        valueCols.add(TranslateUtils.getExprDesc(internalName, type));
        outputColumnNames.add(internalName);
    }
}
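In the last loop, each value column's TypeInfo is taken from the input RowResolver and turned into an expression descriptor. A hedged sketch of what a helper such as TranslateUtils.getExprDesc(internalName, type) is assumed to produce: an ExprNodeColumnDesc wrapping the column's TypeInfo, so the ReduceSinkOperator can carry the value column through the shuffle. The class name ExprDescSketch and the empty table alias are illustrative assumptions.

import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

public class ExprDescSketch {
    // Build a column expression for a value column carried to the reducer.
    public static ExprNodeDesc columnExpr(String internalName, TypeInfo type) {
        // tabAlias left empty; the column is not treated as a partition/virtual column
        return new ExprNodeColumnDesc(type, internalName, "", false);
    }
}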
Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache: the class SemanticAnalyzer, method getColForInsertStmtSpec.
private RowResolver getColForInsertStmtSpec(Map<String, ExprNodeDesc> targetCol2Projection, final Table target,
        Map<String, ColumnInfo> targetCol2ColumnInfo, int colListPos, List<TypeInfo> targetTableColTypes,
        ArrayList<ExprNodeDesc> new_col_list, List<String> targetTableColNames) throws SemanticException {
    RowResolver newOutputRR = new RowResolver();
    Map<String, String> colNameToDefaultVal = null;
    // see if we need to fetch default constraints from metastore
    if (targetCol2Projection.size() < targetTableColNames.size()) {
        try {
            DefaultConstraint dc = Hive.get().getEnabledDefaultConstraints(target.getDbName(), target.getTableName());
            colNameToDefaultVal = dc.getColNameToDefaultValueMap();
        } catch (Exception e) {
            if (e instanceof SemanticException) {
                throw (SemanticException) e;
            } else {
                throw (new RuntimeException(e));
            }
        }
    }
    boolean defaultConstraintsFetch = true;
    for (int i = 0; i < targetTableColNames.size(); i++) {
        String f = targetTableColNames.get(i);
        if (targetCol2Projection.containsKey(f)) {
            // put existing column in new list to make sure it is in the right position
            new_col_list.add(targetCol2Projection.get(f));
            ColumnInfo ci = targetCol2ColumnInfo.get(f);
            ci.setInternalName(getColumnInternalName(colListPos));
            newOutputRR.put(ci.getTabAlias(), ci.getInternalName(), ci);
        } else {
            // add new 'synthetic' columns for projections not provided by Select
            assert (colNameToDefaultVal != null);
            ExprNodeDesc exp = null;
            if (colNameToDefaultVal.containsKey(f)) {
                // make an expression for default value
                String defaultValue = colNameToDefaultVal.get(f);
                ParseDriver parseDriver = new ParseDriver();
                try {
                    ASTNode defValAst = parseDriver.parseExpression(defaultValue);
                    exp = TypeCheckProcFactory.genExprNode(defValAst, new TypeCheckCtx(null)).get(defValAst);
                } catch (Exception e) {
                    throw new SemanticException("Error while parsing default value: " + defaultValue + ". Error message: " + e.getMessage());
                }
                LOG.debug("Added default value from metastore: " + exp);
            } else {
                exp = new ExprNodeConstantDesc(targetTableColTypes.get(i), null);
            }
            new_col_list.add(exp);
            // this column doesn't come from any table
            final String tableAlias = null;
            ColumnInfo colInfo = new ColumnInfo(getColumnInternalName(colListPos), exp.getWritableObjectInspector(), tableAlias, false);
            newOutputRR.put(colInfo.getTabAlias(), colInfo.getInternalName(), colInfo);
        }
        colListPos++;
    }
    return newOutputRR;
}
Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache: the class GenMRSkewJoinProcessor, method processSkewJoin.
/**
* Create tasks for processing skew joins. The idea is (HIVE-964) to use
* separated jobs and map-joins to handle skew joins.
* <p>
* <ul>
* <li>
* Number of mr jobs to handle skew keys is the number of table minus 1 (we
* can stream the last table, so big keys in the last table will not be a
* problem).
* <li>
* At runtime in Join, we output big keys in one table into one corresponding
* directories, and all same keys in other tables into different dirs(one for
* each table). The directories will look like:
* <ul>
* <li>
 * dir-T1-bigkeys (containing big keys in T1), dir-T2-keys (containing keys
 * which are big in T1), dir-T3-keys (containing keys which are big in T1), ...
 * <li>
 * dir-T1-keys (containing keys which are big in T2), dir-T2-bigkeys (containing
 * big keys in T2), dir-T3-keys (containing keys which are big in T2), ...
 * <li>
 * dir-T1-keys (containing keys which are big in T3), dir-T2-keys (containing
 * keys which are big in T3), dir-T3-bigkeys (containing big keys in T3), ...
* </ul>
* </ul>
* For each table, we launch one mapjoin job, taking the directory containing
* big keys in this table and corresponding dirs in other tables as input.
 * (Actually one job for one row in the above.)
*
* <p>
* For more discussions, please check
* https://issues.apache.org/jira/browse/HIVE-964.
*/
@SuppressWarnings("unchecked")
public static void processSkewJoin(JoinOperator joinOp, Task<? extends Serializable> currTask, ParseContext parseCtx) throws SemanticException {
// now does not work with outer joins
if (!GenMRSkewJoinProcessor.skewJoinEnabled(parseCtx.getConf(), joinOp)) {
return;
}
List<Task<? extends Serializable>> children = currTask.getChildTasks();
Path baseTmpDir = parseCtx.getContext().getMRTmpPath();
JoinDesc joinDescriptor = joinOp.getConf();
Map<Byte, List<ExprNodeDesc>> joinValues = joinDescriptor.getExprs();
int numAliases = joinValues.size();
Map<Byte, Path> bigKeysDirMap = new HashMap<Byte, Path>();
Map<Byte, Map<Byte, Path>> smallKeysDirMap = new HashMap<Byte, Map<Byte, Path>>();
Map<Byte, Path> skewJoinJobResultsDir = new HashMap<Byte, Path>();
Byte[] tags = joinDescriptor.getTagOrder();
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
bigKeysDirMap.put(alias, getBigKeysDir(baseTmpDir, alias));
Map<Byte, Path> smallKeysMap = new HashMap<Byte, Path>();
smallKeysDirMap.put(alias, smallKeysMap);
for (Byte src2 : tags) {
if (!src2.equals(alias)) {
smallKeysMap.put(src2, getSmallKeysDir(baseTmpDir, alias, src2));
}
}
skewJoinJobResultsDir.put(alias, getBigKeysSkewJoinResultDir(baseTmpDir, alias));
}
joinDescriptor.setHandleSkewJoin(true);
joinDescriptor.setBigKeysDirMap(bigKeysDirMap);
joinDescriptor.setSmallKeysDirMap(smallKeysDirMap);
joinDescriptor.setSkewKeyDefinition(HiveConf.getIntVar(parseCtx.getConf(), HiveConf.ConfVars.HIVESKEWJOINKEY));
HashMap<Path, Task<? extends Serializable>> bigKeysDirToTaskMap = new HashMap<Path, Task<? extends Serializable>>();
List<Serializable> listWorks = new ArrayList<Serializable>();
List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
MapredWork currPlan = (MapredWork) currTask.getWork();
TableDesc keyTblDesc = (TableDesc) currPlan.getReduceWork().getKeyDesc().clone();
List<String> joinKeys = Utilities.getColumnNames(keyTblDesc.getProperties());
List<String> joinKeyTypes = Utilities.getColumnTypes(keyTblDesc.getProperties());
Map<Byte, TableDesc> tableDescList = new HashMap<Byte, TableDesc>();
Map<Byte, RowSchema> rowSchemaList = new HashMap<Byte, RowSchema>();
Map<Byte, List<ExprNodeDesc>> newJoinValues = new HashMap<Byte, List<ExprNodeDesc>>();
Map<Byte, List<ExprNodeDesc>> newJoinKeys = new HashMap<Byte, List<ExprNodeDesc>>();
// used for create mapJoinDesc, should be in order
List<TableDesc> newJoinValueTblDesc = new ArrayList<TableDesc>();
for (Byte tag : tags) {
newJoinValueTblDesc.add(null);
}
for (int i = 0; i < numAliases; i++) {
Byte alias = tags[i];
List<ExprNodeDesc> valueCols = joinValues.get(alias);
String colNames = "";
String colTypes = "";
int columnSize = valueCols.size();
List<ExprNodeDesc> newValueExpr = new ArrayList<ExprNodeDesc>();
List<ExprNodeDesc> newKeyExpr = new ArrayList<ExprNodeDesc>();
ArrayList<ColumnInfo> columnInfos = new ArrayList<ColumnInfo>();
boolean first = true;
for (int k = 0; k < columnSize; k++) {
TypeInfo type = valueCols.get(k).getTypeInfo();
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
ColumnInfo columnInfo = new ColumnInfo(newColName, type, alias.toString(), false);
columnInfos.add(columnInfo);
newValueExpr.add(new ExprNodeColumnDesc(columnInfo));
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
}
first = false;
colNames = colNames + newColName;
colTypes = colTypes + valueCols.get(k).getTypeString();
}
// we are putting join keys at last part of the spilled table
for (int k = 0; k < joinKeys.size(); k++) {
if (!first) {
colNames = colNames + ",";
colTypes = colTypes + ",";
}
first = false;
colNames = colNames + joinKeys.get(k);
colTypes = colTypes + joinKeyTypes.get(k);
ColumnInfo columnInfo = new ColumnInfo(joinKeys.get(k), TypeInfoFactory.getPrimitiveTypeInfo(joinKeyTypes.get(k)), alias.toString(), false);
columnInfos.add(columnInfo);
newKeyExpr.add(new ExprNodeColumnDesc(columnInfo));
}
newJoinValues.put(alias, newValueExpr);
newJoinKeys.put(alias, newKeyExpr);
tableDescList.put(alias, Utilities.getTableDesc(colNames, colTypes));
rowSchemaList.put(alias, new RowSchema(columnInfos));
// construct value table Desc
String valueColNames = "";
String valueColTypes = "";
first = true;
for (int k = 0; k < columnSize; k++) {
// any name, it does not matter.
String newColName = i + "_VALUE_" + k;
if (!first) {
valueColNames = valueColNames + ",";
valueColTypes = valueColTypes + ",";
}
valueColNames = valueColNames + newColName;
valueColTypes = valueColTypes + valueCols.get(k).getTypeString();
first = false;
}
newJoinValueTblDesc.set(Byte.valueOf((byte) i), Utilities.getTableDesc(valueColNames, valueColTypes));
}
joinDescriptor.setSkewKeysValuesTables(tableDescList);
joinDescriptor.setKeyTableDesc(keyTblDesc);
for (int i = 0; i < numAliases - 1; i++) {
Byte src = tags[i];
MapWork newPlan = PlanUtils.getMapRedWork().getMapWork();
// This code has been only added for testing
boolean mapperCannotSpanPartns = parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_MAPPER_CANNOT_SPAN_MULTIPLE_PARTITIONS);
newPlan.setMapperCannotSpanPartns(mapperCannotSpanPartns);
MapredWork clonePlan = SerializationUtilities.clonePlan(currPlan);
Operator<? extends OperatorDesc>[] parentOps = new TableScanOperator[tags.length];
for (int k = 0; k < tags.length; k++) {
Operator<? extends OperatorDesc> ts = GenMapRedUtils.createTemporaryTableScanOperator(joinOp.getCompilationOpContext(), rowSchemaList.get((byte) k));
((TableScanOperator) ts).setTableDescSkewJoin(tableDescList.get((byte) k));
parentOps[k] = ts;
}
Operator<? extends OperatorDesc> tblScan_op = parentOps[i];
ArrayList<String> aliases = new ArrayList<String>();
String alias = src.toString().intern();
aliases.add(alias);
Path bigKeyDirPath = bigKeysDirMap.get(src);
newPlan.addPathToAlias(bigKeyDirPath, aliases);
newPlan.getAliasToWork().put(alias, tblScan_op);
PartitionDesc part = new PartitionDesc(tableDescList.get(src), null);
newPlan.addPathToPartitionInfo(bigKeyDirPath, part);
newPlan.getAliasToPartnInfo().put(alias, part);
Operator<? extends OperatorDesc> reducer = clonePlan.getReduceWork().getReducer();
assert reducer instanceof JoinOperator;
JoinOperator cloneJoinOp = (JoinOperator) reducer;
String dumpFilePrefix = "mapfile" + PlanUtils.getCountForMapJoinDumpFilePrefix();
MapJoinDesc mapJoinDescriptor = new MapJoinDesc(newJoinKeys, keyTblDesc, newJoinValues, newJoinValueTblDesc, newJoinValueTblDesc, joinDescriptor.getOutputColumnNames(), i, joinDescriptor.getConds(), joinDescriptor.getFilters(), joinDescriptor.getNoOuterJoin(), dumpFilePrefix, joinDescriptor.getMemoryMonitorInfo(), joinDescriptor.getInMemoryDataSize());
mapJoinDescriptor.setTagOrder(tags);
mapJoinDescriptor.setHandleSkewJoin(false);
mapJoinDescriptor.setNullSafes(joinDescriptor.getNullSafes());
mapJoinDescriptor.setColumnExprMap(joinDescriptor.getColumnExprMap());
MapredLocalWork localPlan = new MapredLocalWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(), new LinkedHashMap<String, FetchWork>());
Map<Byte, Path> smallTblDirs = smallKeysDirMap.get(src);
for (int j = 0; j < numAliases; j++) {
if (j == i) {
continue;
}
Byte small_alias = tags[j];
Operator<? extends OperatorDesc> tblScan_op2 = parentOps[j];
localPlan.getAliasToWork().put(small_alias.toString(), tblScan_op2);
Path tblDir = smallTblDirs.get(small_alias);
localPlan.getAliasToFetchWork().put(small_alias.toString(), new FetchWork(tblDir, tableDescList.get(small_alias)));
}
newPlan.setMapRedLocalWork(localPlan);
// construct a map join and set it as the child operator of tblScan_op
MapJoinOperator mapJoinOp = (MapJoinOperator) OperatorFactory.getAndMakeChild(joinOp.getCompilationOpContext(), mapJoinDescriptor, (RowSchema) null, parentOps);
// change the children of the original join operator to point to the map
// join operator
List<Operator<? extends OperatorDesc>> childOps = cloneJoinOp.getChildOperators();
for (Operator<? extends OperatorDesc> childOp : childOps) {
childOp.replaceParent(cloneJoinOp, mapJoinOp);
}
mapJoinOp.setChildOperators(childOps);
HiveConf jc = new HiveConf(parseCtx.getConf(), GenMRSkewJoinProcessor.class);
newPlan.setNumMapTasks(HiveConf.getIntVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINNUMMAPTASK));
newPlan.setMinSplitSize(HiveConf.getLongVar(jc, HiveConf.ConfVars.HIVESKEWJOINMAPJOINMINSPLIT));
newPlan.setInputformat(HiveInputFormat.class.getName());
MapredWork w = new MapredWork();
w.setMapWork(newPlan);
Task<? extends Serializable> skewJoinMapJoinTask = TaskFactory.get(w);
skewJoinMapJoinTask.setFetchSource(currTask.isFetchSource());
bigKeysDirToTaskMap.put(bigKeyDirPath, skewJoinMapJoinTask);
listWorks.add(skewJoinMapJoinTask.getWork());
listTasks.add(skewJoinMapJoinTask);
}
if (children != null) {
for (Task<? extends Serializable> tsk : listTasks) {
for (Task<? extends Serializable> oldChild : children) {
tsk.addDependentTask(oldChild);
}
}
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
for (Task<? extends Serializable> oldChild : children) {
oldChild.getParentTasks().remove(currTask);
}
listTasks.addAll(children);
}
ConditionalResolverSkewJoinCtx context = new ConditionalResolverSkewJoinCtx(bigKeysDirToTaskMap, children);
ConditionalWork cndWork = new ConditionalWork(listWorks);
ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
cndTsk.setListTasks(listTasks);
cndTsk.setResolver(new ConditionalResolverSkewJoin());
cndTsk.setResolverCtx(context);
currTask.setChildTasks(new ArrayList<Task<? extends Serializable>>());
currTask.addDependentTask(cndTsk);
return;
}
Use of org.apache.hadoop.hive.serde2.typeinfo.TypeInfo in project hive by apache: the class VectorPTFOperator, method allocateOverflowBatchColumnVector.
/*
* Allocate overflow batch columns by hand.
*/
private void allocateOverflowBatchColumnVector(VectorizedRowBatch overflowBatch, int outputColumn, String typeName) throws HiveException {
    if (overflowBatch.cols[outputColumn] == null) {
        typeName = VectorizationContext.mapTypeNameSynonyms(typeName);
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName);
        overflowBatch.cols[outputColumn] = VectorizedBatchUtil.createColumnVector(typeInfo);
    }
}
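A usage sketch of the type-string round trip above: a Hive type name is parsed back into a TypeInfo, from which a matching ColumnVector can be allocated. The type strings and the class name OverflowColumnSketch are examples, not values taken from a real query plan; the expected concrete ColumnVector subclasses are noted in comments.

import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedBatchUtil;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;

public class OverflowColumnSketch {
    // Parse a type name into a TypeInfo and allocate the corresponding column vector.
    public static ColumnVector columnFor(String typeName) {
        TypeInfo typeInfo = TypeInfoUtils.getTypeInfoFromTypeString(typeName);
        return VectorizedBatchUtil.createColumnVector(typeInfo);
    }

    public static void main(String[] args) {
        System.out.println(columnFor("bigint").getClass().getSimpleName());        // LongColumnVector
        System.out.println(columnFor("decimal(10,2)").getClass().getSimpleName()); // DecimalColumnVector
    }
}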