use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
the class PlanUtils method configureJobPropertiesForStorageHandler.
private static void configureJobPropertiesForStorageHandler(boolean input, TableDesc tableDesc) {
if (tableDesc == null) {
return;
}
try {
HiveStorageHandler storageHandler = HiveUtils.getStorageHandler(Hive.get().getConf(), tableDesc.getProperties().getProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE));
if (storageHandler != null) {
Map<String, String> jobProperties = new LinkedHashMap<String, String>();
Map<String, String> jobSecrets = new LinkedHashMap<String, String>();
if (input) {
try {
storageHandler.configureInputJobProperties(tableDesc, jobProperties);
} catch (AbstractMethodError e) {
LOG.info("configureInputJobProperties not found " + "using configureTableJobProperties", e);
storageHandler.configureTableJobProperties(tableDesc, jobProperties);
}
try {
storageHandler.configureInputJobCredentials(tableDesc, jobSecrets);
} catch (AbstractMethodError e) {
// ignore
LOG.info("configureInputJobSecrets not found");
}
} else {
try {
storageHandler.configureOutputJobProperties(tableDesc, jobProperties);
} catch (AbstractMethodError e) {
LOG.info("configureOutputJobProperties not found" + "using configureTableJobProperties", e);
storageHandler.configureTableJobProperties(tableDesc, jobProperties);
}
}
// Job properties are only relevant for non-native tables, so for native tables leave them unset to avoid cluttering up plans.
if (!jobProperties.isEmpty()) {
tableDesc.setJobProperties(jobProperties);
}
// same idea, only set for non-native tables
if (!jobSecrets.isEmpty()) {
tableDesc.setJobSecrets(jobSecrets);
}
}
} catch (HiveException ex) {
throw new RuntimeException(ex);
}
}
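The AbstractMethodError fallbacks above exist because configureInputJobProperties and configureOutputJobProperties were added to the interface later than configureTableJobProperties, so handlers compiled against the older interface may only implement the latter. As a rough sketch of the handler side of this contract (the class name and property key are made up for illustration), a handler extending DefaultStorageHandler might publish per-query job properties like this:

import java.util.Map;

import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.plan.TableDesc;

public class ExampleStorageHandler extends DefaultStorageHandler {

  @Override
  public void configureInputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
    // Copy a (hypothetical) table-level property into the per-query job
    // properties so the InputFormat can read it at split-generation time.
    String endpoint = tableDesc.getProperties().getProperty("example.connection.url");
    if (endpoint != null) {
      jobProperties.put("example.connection.url", endpoint);
    }
  }

  @Override
  public void configureOutputJobProperties(TableDesc tableDesc, Map<String, String> jobProperties) {
    // Same idea on the write path; PlanUtils calls this when input == false.
    configureInputJobProperties(tableDesc, jobProperties);
  }
}

PlanUtils then copies whatever ends up in jobProperties into the TableDesc, so it reaches the job configuration of every task that reads or writes the table.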
use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
the class Utilities method getInputSummaryWithPool.
@VisibleForTesting
static ContentSummary getInputSummaryWithPool(final Context ctx, Set<Path> pathNeedProcess, MapWork work, long[] summary, ExecutorService executor) throws IOException {
List<Future<?>> results = new ArrayList<Future<?>>();
final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
@Override
public void interrupt() {
for (Path path : pathNeedProcess) {
try {
path.getFileSystem(ctx.getConf()).close();
} catch (IOException ignore) {
LOG.debug("Failed to close filesystem", ignore);
}
}
if (executor != null) {
executor.shutdownNow();
}
}
});
try {
Configuration conf = ctx.getConf();
JobConf jobConf = new JobConf(conf);
for (Path path : pathNeedProcess) {
final Path p = path;
final String pathStr = path.toString();
// All threads share the same Configuration and JobConf, on the
// assumption that they are thread safe as long as only read operations
// are performed. Although Hadoop's javadoc does not state this, the
// source code clearly shows effort was made to keep them thread safe,
// so we believe the assumption holds. Revisit this code if the
// assumption turns out to be incorrect.
final Configuration myConf = conf;
final JobConf myJobConf = jobConf;
final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
Runnable r = new Runnable() {
@Override
public void run() {
try {
Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
if (inputFormatObj instanceof ContentSummaryInputFormat) {
ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
return;
}
String metaTableStorage = null;
if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
}
if (partDesc.getProperties() != null) {
metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
}
HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
if (handler instanceof InputEstimator) {
long total = 0;
TableDesc tableDesc = partDesc.getTableDesc();
InputEstimator estimator = (InputEstimator) handler;
for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
JobConf jobConf = new JobConf(myJobConf);
TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
Utilities.setColumnNameList(jobConf, scanOp, true);
Utilities.setColumnTypeList(jobConf, scanOp, true);
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
}
resultMap.put(pathStr, new ContentSummary(total, -1, -1));
} else {
// TODO: the summary should be nullified for non-native tables so that
// they are not selected as a map-join target.
FileSystem fs = p.getFileSystem(myConf);
resultMap.put(pathStr, fs.getContentSummary(p));
}
} catch (Exception e) {
// We safely ignore this exception for summary data.
// We don't update the cache, to protect it from polluting other
// usages. The worst case is that the IOException will be retried on
// the next getInputSummary() call, which is acceptable because
// IOException is not considered a common case.
LOG.info("Cannot get size of {}. Safely ignored.", pathStr);
}
}
};
if (executor == null) {
r.run();
} else {
Future<?> result = executor.submit(r);
results.add(result);
}
}
if (executor != null) {
for (Future<?> result : results) {
boolean executorDone = false;
do {
try {
result.get();
executorDone = true;
} catch (InterruptedException e) {
LOG.info("Interrupted when waiting threads: ", e);
Thread.currentThread().interrupt();
break;
} catch (ExecutionException e) {
throw new IOException(e);
}
} while (!executorDone);
}
executor.shutdown();
}
HiveInterruptUtils.checkInterrupted();
for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
ContentSummary cs = entry.getValue();
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
ctx.addCS(entry.getKey(), cs);
if (LOG.isInfoEnabled()) {
LOG.info("Cache Content Summary for {} length: {} file count: {} " + " directory count: {}", entry.getKey(), cs.getLength(), cs.getFileCount(), cs.getDirectoryCount());
}
}
return new ContentSummary(summary[0], summary[1], summary[2]);
} finally {
if (executor != null) {
executor.shutdownNow();
}
HiveInterruptUtils.remove(interrup);
}
}
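For the instanceof InputEstimator branch above, the storage handler itself supplies the input size instead of FileSystem.getContentSummary(). Below is a minimal sketch of such a handler, under the assumption that InputEstimator.Estimation is constructed from (rowCount, totalLength) and that -1 can be passed for an unknown row count; the configuration key and lookup are hypothetical:

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.ql.metadata.InputEstimator;
import org.apache.hadoop.mapred.JobConf;

public class EstimatingStorageHandler extends DefaultStorageHandler implements InputEstimator {

  @Override
  public Estimation estimate(JobConf job, TableScanOperator ts, long remaining) {
    // A real handler would ask its backing store for the size, typically using
    // the job properties copied in by Utilities.copyTableJobPropertiesToConf().
    // Here the value simply comes from a hypothetical configuration key.
    long estimatedBytes = job.getLong("example.estimated.bytes", 0L);
    // -1 signals an unknown row count; only getTotalLength() is used by
    // getInputSummaryWithPool().
    return new Estimation(-1, estimatedBytes);
  }
}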
use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
the class HCatBaseOutputFormat method configureOutputStorageHandler.
/**
* Configure the output storage handler, allowing missing dynamic partition values to be supplied.
* @param jobContext the job context
* @param dynamicPartVals the dynamic partition values, in the same order as the table's dynamic partitioning keys
* @throws IOException if configuring the storage handler fails
*/
@SuppressWarnings("unchecked")
static void configureOutputStorageHandler(JobContext jobContext, List<String> dynamicPartVals) throws IOException {
Configuration conf = jobContext.getConfiguration();
try {
OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(conf.get(HCatConstants.HCAT_KEY_OUTPUT_INFO));
HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(jobContext.getConfiguration(), jobInfo.getTableInfo().getStorerInfo());
Map<String, String> partitionValues = jobInfo.getPartitionValues();
String location = jobInfo.getLocation();
if (dynamicPartVals != null) {
// dynamic part vals specified
List<String> dynamicPartKeys = jobInfo.getDynamicPartitioningKeys();
if (dynamicPartVals.size() != dynamicPartKeys.size()) {
throw new HCatException(ErrorType.ERROR_INVALID_PARTITION_VALUES, "Unable to configure dynamic partitioning for storage handler, mismatch between" + " number of partition values obtained[" + dynamicPartVals.size() + "] and number of partition values required[" + dynamicPartKeys.size() + "]");
}
for (int i = 0; i < dynamicPartKeys.size(); i++) {
partitionValues.put(dynamicPartKeys.get(i), dynamicPartVals.get(i));
}
// // re-home location, now that we know the rest of the partvals
// Table table = jobInfo.getTableInfo().getTable();
//
// List<String> partitionCols = new ArrayList<String>();
// for(FieldSchema schema : table.getPartitionKeys()) {
// partitionCols.add(schema.getName());
// }
jobInfo.setPartitionValues(partitionValues);
}
HCatUtil.configureOutputStorageHandler(storageHandler, conf, jobInfo);
} catch (Exception e) {
if (e instanceof HCatException) {
throw (HCatException) e;
} else {
throw new HCatException(ErrorType.ERROR_INIT_STORAGE_HANDLER, e);
}
}
}
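The OutputJobInfo deserialized above is normally placed in the job configuration by the client that sets up the HCatalog write. A hedged sketch of that client-side setup follows (database, table, and partition names are made up); partition columns left out of the static map become the dynamic partitions that are later completed through the dynamicPartVals path shown above:

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class HCatWriteSetup {

  public static void configure(Job job) throws Exception {
    // Static partition values known up front; any remaining partition columns
    // are treated as dynamic partitions.
    Map<String, String> staticPartitionValues = new HashMap<String, String>();
    staticPartitionValues.put("ds", "2024-01-01");

    OutputJobInfo jobInfo = OutputJobInfo.create("default", "example_table", staticPartitionValues);
    // Serializes the OutputJobInfo into the job configuration under
    // HCatConstants.HCAT_KEY_OUTPUT_INFO, where configureOutputStorageHandler()
    // reads it back.
    HCatOutputFormat.setOutput(job, jobInfo);
  }
}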
use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
the class CreateTableDesc method toTable.
public Table toTable(HiveConf conf) throws HiveException {
String databaseName = getDatabaseName();
String tableName = getTableName();
if (databaseName == null || tableName.contains(".")) {
String[] names = Utilities.getDbTableName(tableName);
databaseName = names[0];
tableName = names[1];
}
Table tbl = new Table(databaseName, tableName);
if (getTblProps() != null) {
tbl.getTTable().getParameters().putAll(getTblProps());
}
if (getPartCols() != null) {
tbl.setPartCols(getPartCols());
}
if (getNumBuckets() != -1) {
tbl.setNumBuckets(getNumBuckets());
}
if (getStorageHandler() != null) {
tbl.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, getStorageHandler());
}
HiveStorageHandler storageHandler = tbl.getStorageHandler();
/*
* If the user didn't specify a SerDe, we use the default.
*/
String serDeClassName;
if (getSerName() == null) {
if (storageHandler == null) {
serDeClassName = PlanUtils.getDefaultSerDe().getName();
LOG.info("Default to " + serDeClassName + " for table " + tableName);
} else {
serDeClassName = storageHandler.getSerDeClass().getName();
LOG.info("Use StorageHandler-supplied " + serDeClassName + " for table " + tableName);
}
} else {
// let's validate that the serde exists
serDeClassName = getSerName();
DDLTask.validateSerDe(serDeClassName, conf);
}
tbl.setSerializationLib(serDeClassName);
if (getFieldDelim() != null) {
tbl.setSerdeParam(serdeConstants.FIELD_DELIM, getFieldDelim());
tbl.setSerdeParam(serdeConstants.SERIALIZATION_FORMAT, getFieldDelim());
}
if (getFieldEscape() != null) {
tbl.setSerdeParam(serdeConstants.ESCAPE_CHAR, getFieldEscape());
}
if (getCollItemDelim() != null) {
tbl.setSerdeParam(serdeConstants.COLLECTION_DELIM, getCollItemDelim());
}
if (getMapKeyDelim() != null) {
tbl.setSerdeParam(serdeConstants.MAPKEY_DELIM, getMapKeyDelim());
}
if (getLineDelim() != null) {
tbl.setSerdeParam(serdeConstants.LINE_DELIM, getLineDelim());
}
if (getNullFormat() != null) {
tbl.setSerdeParam(serdeConstants.SERIALIZATION_NULL_FORMAT, getNullFormat());
}
if (getSerdeProps() != null) {
Iterator<Map.Entry<String, String>> iter = getSerdeProps().entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<String, String> m = iter.next();
tbl.setSerdeParam(m.getKey(), m.getValue());
}
}
if (getCols() != null) {
tbl.setFields(getCols());
}
if (getBucketCols() != null) {
tbl.setBucketCols(getBucketCols());
}
if (getSortCols() != null) {
tbl.setSortCols(getSortCols());
}
if (getComment() != null) {
tbl.setProperty("comment", getComment());
}
if (getLocation() != null) {
tbl.setDataLocation(new Path(getLocation()));
}
if (getSkewedColNames() != null) {
tbl.setSkewedColNames(getSkewedColNames());
}
if (getSkewedColValues() != null) {
tbl.setSkewedColValues(getSkewedColValues());
}
tbl.getTTable().setTemporary(isTemporary());
tbl.setStoredAsSubDirectories(isStoredAsSubDirectories());
tbl.setInputFormatClass(getInputFormat());
tbl.setOutputFormatClass(getOutputFormat());
// Only record explicitly specified input/output formats in the table metadata. Otherwise, load them lazily via the StorageHandler at query time.
if (getInputFormat() != null && !getInputFormat().isEmpty()) {
tbl.getTTable().getSd().setInputFormat(tbl.getInputFormatClass().getName());
}
if (getOutputFormat() != null && !getOutputFormat().isEmpty()) {
tbl.getTTable().getSd().setOutputFormat(tbl.getOutputFormatClass().getName());
}
if (DDLTask.doesTableNeedLocation(tbl)) {
// If a location is specified, ensure that it is a fully qualified name
DDLTask.makeLocationQualified(tbl.getDbName(), tbl.getTTable().getSd(), tableName, conf);
}
if (isExternal()) {
tbl.setProperty("EXTERNAL", "TRUE");
tbl.setTableType(TableType.EXTERNAL_TABLE);
}
// If the sort columns are a superset of the bucketed columns, record that fact; it can later be used to optimize some group-by queries. The order does not matter, as long as the bucketed columns appear within the first 'n' sort columns, where 'n' is the number of bucketed columns.
if ((tbl.getBucketCols() != null) && (tbl.getSortCols() != null)) {
List<String> bucketCols = tbl.getBucketCols();
List<Order> sortCols = tbl.getSortCols();
if ((sortCols.size() > 0) && (sortCols.size() >= bucketCols.size())) {
boolean found = true;
Iterator<String> iterBucketCols = bucketCols.iterator();
while (iterBucketCols.hasNext()) {
String bucketCol = iterBucketCols.next();
boolean colFound = false;
for (int i = 0; i < bucketCols.size(); i++) {
if (bucketCol.equals(sortCols.get(i).getCol())) {
colFound = true;
break;
}
}
if (colFound == false) {
found = false;
break;
}
}
if (found) {
tbl.setProperty("SORTBUCKETCOLSPREFIX", "TRUE");
}
}
}
if (!this.isCTAS && (tbl.getPath() == null || (tbl.isEmpty() && !isExternal()))) {
if (!tbl.isPartitioned() && conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), MetaStoreUtils.getColumnNames(tbl.getCols()), StatsSetupConst.TRUE);
}
} else {
StatsSetupConst.setStatsStateForCreateTable(tbl.getTTable().getParameters(), null, StatsSetupConst.FALSE);
}
return tbl;
}
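The storageHandler.getSerDeClass() branch above is what lets a handler choose the SerDe when the DDL does not name one. A minimal sketch of such a handler, assuming a Hive version where getSerDeClass() returns Class<? extends AbstractSerDe> (older releases used the SerDe type instead); the class name is illustrative:

import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.OpenCSVSerde;

public class CsvBackedStorageHandler extends DefaultStorageHandler {

  @Override
  public Class<? extends AbstractSerDe> getSerDeClass() {
    // Returned to CreateTableDesc.toTable() / CreateViewDesc.toTable() when the
    // user does not specify a SerDe explicitly.
    return OpenCSVSerde.class;
  }
}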
use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
the class CreateViewDesc method toTable.
public Table toTable(HiveConf conf) throws HiveException {
String[] names = Utilities.getDbTableName(getViewName());
String databaseName = names[0];
String tableName = names[1];
Table tbl = new Table(databaseName, tableName);
tbl.setViewOriginalText(getViewOriginalText());
tbl.setViewExpandedText(getViewExpandedText());
if (isMaterialized()) {
tbl.setRewriteEnabled(isRewriteEnabled());
tbl.setTableType(TableType.MATERIALIZED_VIEW);
} else {
tbl.setTableType(TableType.VIRTUAL_VIEW);
}
tbl.setSerializationLib(null);
tbl.clearSerDeInfo();
tbl.setFields(getSchema());
if (getComment() != null) {
tbl.setProperty("comment", getComment());
}
if (getTblProps() != null) {
tbl.getTTable().getParameters().putAll(getTblProps());
}
if (getPartCols() != null) {
tbl.setPartCols(getPartCols());
}
if (getInputFormat() != null) {
tbl.setInputFormatClass(getInputFormat());
}
if (getOutputFormat() != null) {
tbl.setOutputFormatClass(getOutputFormat());
}
if (isMaterialized()) {
if (getLocation() != null) {
tbl.setDataLocation(new Path(getLocation()));
}
if (getStorageHandler() != null) {
tbl.setProperty(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, getStorageHandler());
}
HiveStorageHandler storageHandler = tbl.getStorageHandler();
/*
* If the user didn't specify a SerDe, we use the default.
*/
String serDeClassName;
if (getSerde() == null) {
if (storageHandler == null) {
serDeClassName = PlanUtils.getDefaultSerDe().getName();
LOG.info("Default to {} for materialized view {}", serDeClassName, getViewName());
} else {
serDeClassName = storageHandler.getSerDeClass().getName();
LOG.info("Use StorageHandler-supplied {} for materialized view {}", serDeClassName, getViewName());
}
} else {
// let's validate that the serde exists
serDeClassName = getSerde();
DDLTask.validateSerDe(serDeClassName, conf);
}
tbl.setSerializationLib(serDeClassName);
// To remain consistent, we need to set input and output formats both
// at the table level and the storage handler level.
tbl.setInputFormatClass(getInputFormat());
tbl.setOutputFormatClass(getOutputFormat());
if (getInputFormat() != null && !getInputFormat().isEmpty()) {
tbl.getSd().setInputFormat(tbl.getInputFormatClass().getName());
}
if (getOutputFormat() != null && !getOutputFormat().isEmpty()) {
tbl.getSd().setOutputFormat(tbl.getOutputFormatClass().getName());
}
}
return tbl;
}
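Both toTable() methods record the handler class in the META_TABLE_STORAGE table property and later resolve it through tbl.getStorageHandler(). The same resolution can be sketched directly against HiveUtils, roughly as follows (the database and table names are placeholders, and the lookup returns null for native tables with no handler set):

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hadoop.hive.ql.metadata.HiveUtils;
import org.apache.hadoop.hive.ql.metadata.Table;

public class StorageHandlerLookup {

  public static HiveStorageHandler lookup(HiveConf conf, String dbName, String tableName) throws Exception {
    Table table = Hive.get(conf).getTable(dbName, tableName);
    String handlerClass = table.getProperty(hive_metastoreConstants.META_TABLE_STORAGE);
    // HiveUtils.getStorageHandler() returns null when no handler class is set,
    // i.e. for native tables.
    return HiveUtils.getStorageHandler(conf, handlerClass);
  }
}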