Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
The class HCatTable, method storageHandler.
/**
* Setter for StorageHandler class.
*/
public HCatTable storageHandler(String storageHandler) throws HCatException {
this.tblProps.put(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE, storageHandler);
LOG.warn("HiveStorageHandlers can't be reliably instantiated on the client-side. " + "Attempting to derive Input/OutputFormat settings from StorageHandler, on best effort: ");
try {
HiveStorageHandler sh = HiveUtils.getStorageHandler(getConf(), storageHandler);
this.sd.setInputFormat(sh.getInputFormatClass().getName());
this.sd.setOutputFormat(sh.getOutputFormatClass().getName());
this.sd.getSerdeInfo().setSerializationLib(sh.getSerDeClass().getName());
} catch (HiveException e) {
LOG.warn("Could not derive Input/OutputFormat and SerDe settings from storageHandler. " + "These values need to be set explicitly.", e);
}
return this;
}
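For context, a minimal sketch of how this setter might be called when building a table descriptor through the HCatalog client API; the database/table names, the column, and the HBaseStorageHandler class below are illustrative assumptions, not taken from the snippet above.

import java.util.Arrays;

import org.apache.hive.hcatalog.api.HCatTable;
import org.apache.hive.hcatalog.common.HCatException;
import org.apache.hive.hcatalog.data.schema.HCatFieldSchema;

public class StorageHandlerSetterSketch {
  public static void main(String[] args) throws HCatException {
    // Illustrative table descriptor; names and the handler class are assumptions.
    HCatTable table = new HCatTable("default", "sample_table")
        .cols(Arrays.asList(
            new HCatFieldSchema("key", HCatFieldSchema.Type.STRING, "row key")))
        // Stores the handler class in the table properties and, on a best-effort
        // basis, derives Input/OutputFormat and SerDe settings as shown above.
        .storageHandler("org.apache.hadoop.hive.hbase.HBaseStorageHandler");
    System.out.println(table);
  }
}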
Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
The class Utilities, method getInputSummary.
/**
* Calculate the total size of input files.
*
* @param ctx
* the hadoop job context
* @param work
* map reduce job plan
* @param filter
* filter to apply to the input paths before calculating size
* @return the summary of all the input paths.
* @throws IOException
*/
public static ContentSummary getInputSummary(final Context ctx, MapWork work, PathFilter filter) throws IOException {
PerfLogger perfLogger = SessionState.getPerfLogger();
perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
long[] summary = { 0, 0, 0 };
final Set<Path> pathNeedProcess = new HashSet<>();
// Synchronize so that the number of threads does not grow out of control.
synchronized (INPUT_SUMMARY_LOCK) {
// For each input path, calculate the total size.
for (Path path : work.getPathToAliases().keySet()) {
Path p = path;
if (filter != null && !filter.accept(p)) {
continue;
}
ContentSummary cs = ctx.getCS(path);
if (cs == null) {
if (path == null) {
continue;
}
pathNeedProcess.add(path);
} else {
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
}
}
// Process the paths for which a NameNode call is needed.
final Map<String, ContentSummary> resultMap = new ConcurrentHashMap<String, ContentSummary>();
ArrayList<Future<?>> results = new ArrayList<Future<?>>();
final ExecutorService executor;
int numExecutors = getMaxExecutorsForInputListing(ctx.getConf(), pathNeedProcess.size());
if (numExecutors > 1) {
LOG.info("Using " + numExecutors + " threads for getContentSummary");
executor = Executors.newFixedThreadPool(numExecutors, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("Get-Input-Summary-%d").build());
} else {
executor = null;
}
HiveInterruptCallback interrup = HiveInterruptUtils.add(new HiveInterruptCallback() {
@Override
public void interrupt() {
for (Path path : pathNeedProcess) {
try {
path.getFileSystem(ctx.getConf()).close();
} catch (IOException ignore) {
LOG.debug("Failed to close filesystem", ignore);
}
}
if (executor != null) {
executor.shutdownNow();
}
}
});
try {
Configuration conf = ctx.getConf();
JobConf jobConf = new JobConf(conf);
for (Path path : pathNeedProcess) {
final Path p = path;
final String pathStr = path.toString();
// All threads share the same Configuration and JobConf, on the assumption that
// they are thread safe as long as only read operations are performed. Although
// Hadoop's javadoc does not state this, the source code clearly shows an effort
// to support it, so we believe the assumption holds. We will revisit this code
// if the assumption turns out to be incorrect.
final Configuration myConf = conf;
final JobConf myJobConf = jobConf;
final Map<String, Operator<?>> aliasToWork = work.getAliasToWork();
final Map<Path, ArrayList<String>> pathToAlias = work.getPathToAliases();
final PartitionDesc partDesc = work.getPathToPartitionInfo().get(p);
Runnable r = new Runnable() {
@Override
public void run() {
try {
Class<? extends InputFormat> inputFormatCls = partDesc.getInputFileFormatClass();
InputFormat inputFormatObj = HiveInputFormat.getInputFormatFromCache(inputFormatCls, myJobConf);
if (inputFormatObj instanceof ContentSummaryInputFormat) {
ContentSummaryInputFormat cs = (ContentSummaryInputFormat) inputFormatObj;
resultMap.put(pathStr, cs.getContentSummary(p, myJobConf));
return;
}
String metaTableStorage = null;
if (partDesc.getTableDesc() != null && partDesc.getTableDesc().getProperties() != null) {
metaTableStorage = partDesc.getTableDesc().getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, null);
}
if (partDesc.getProperties() != null) {
metaTableStorage = partDesc.getProperties().getProperty(hive_metastoreConstants.META_TABLE_STORAGE, metaTableStorage);
}
HiveStorageHandler handler = HiveUtils.getStorageHandler(myConf, metaTableStorage);
if (handler instanceof InputEstimator) {
long total = 0;
TableDesc tableDesc = partDesc.getTableDesc();
InputEstimator estimator = (InputEstimator) handler;
for (String alias : HiveFileFormatUtils.doGetAliasesFromPath(pathToAlias, p)) {
JobConf jobConf = new JobConf(myJobConf);
TableScanOperator scanOp = (TableScanOperator) aliasToWork.get(alias);
Utilities.setColumnNameList(jobConf, scanOp, true);
Utilities.setColumnTypeList(jobConf, scanOp, true);
PlanUtils.configureInputJobPropertiesForStorageHandler(tableDesc);
Utilities.copyTableJobPropertiesToConf(tableDesc, jobConf);
total += estimator.estimate(jobConf, scanOp, -1).getTotalLength();
}
resultMap.put(pathStr, new ContentSummary(total, -1, -1));
} else {
// todo: should nullify summary for non-native tables,
// not to be selected as a mapjoin target
FileSystem fs = p.getFileSystem(myConf);
resultMap.put(pathStr, fs.getContentSummary(p));
}
} catch (Exception e) {
// We safely ignore this exception for summary data.
// We don't update the cache, so that other usages are not affected by a
// polluted entry. The worst case is that the IOException will be retried on
// the next getInputSummary(), which is fine because IOExceptions are not
// the common case.
LOG.info("Cannot get size of " + pathStr + ". Safely ignored.");
}
}
};
if (executor == null) {
r.run();
} else {
Future<?> result = executor.submit(r);
results.add(result);
}
}
if (executor != null) {
for (Future<?> result : results) {
boolean executorDone = false;
do {
try {
result.get();
executorDone = true;
} catch (InterruptedException e) {
LOG.info("Interrupted when waiting threads: ", e);
Thread.currentThread().interrupt();
break;
} catch (ExecutionException e) {
throw new IOException(e);
}
} while (!executorDone);
}
executor.shutdown();
}
HiveInterruptUtils.checkInterrupted();
for (Map.Entry<String, ContentSummary> entry : resultMap.entrySet()) {
ContentSummary cs = entry.getValue();
summary[0] += cs.getLength();
summary[1] += cs.getFileCount();
summary[2] += cs.getDirectoryCount();
ctx.addCS(entry.getKey(), cs);
LOG.info("Cache Content Summary for " + entry.getKey() + " length: " + cs.getLength() + " file count: " + cs.getFileCount() + " directory count: " + cs.getDirectoryCount());
}
perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.INPUT_SUMMARY);
return new ContentSummary(summary[0], summary[1], summary[2]);
} finally {
HiveInterruptUtils.remove(interrup);
}
}
}
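A hedged caller sketch for the method above: given a compilation Context and a MapWork plan (both assumed to come from the surrounding query-planning code), the returned summary gives the total input size; a null filter counts every registered input path.

import java.io.IOException;

import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.hive.ql.Context;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;

public class InputSummarySketch {
  /** Returns the total input length in bytes for the given MapWork plan. */
  static long totalInputLength(Context ctx, MapWork work) throws IOException {
    // Passing a null PathFilter includes every path registered in the plan.
    ContentSummary summary = Utilities.getInputSummary(ctx, work, null);
    return summary.getLength();
  }
}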
Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
The class HCatUtil, method getStorageHandler.
/**
* Create an instance of a storage handler. If storageHandler == null,
* then a surrogate StorageHandler is used to encapsulate the InputFormat, OutputFormat and SerDe.
* This StorageHandler assumes the other supplied storage artifacts are for a file-based storage system.
* @param conf the job configuration, used to configure the StorageHandler if it is Configurable
* @param storageHandler fully qualified class name of the desired StorageHandler instance
* @param serDe fully qualified class name of the desired SerDe instance
* @param inputFormat fully qualified class name of the desired InputFormat instance
* @param outputFormat fully qualified class name of the desired OutputFormat instance
* @return the storage handler instance
* @throws IOException
*/
public static HiveStorageHandler getStorageHandler(Configuration conf, String storageHandler, String serDe, String inputFormat, String outputFormat) throws IOException {
if ((storageHandler == null) || (storageHandler.equals(FosterStorageHandler.class.getName()))) {
try {
FosterStorageHandler fosterStorageHandler = new FosterStorageHandler(inputFormat, outputFormat, serDe);
fosterStorageHandler.setConf(conf);
return fosterStorageHandler;
} catch (ClassNotFoundException e) {
throw new IOException("Failed to load " + "foster storage handler", e);
}
}
try {
Class<? extends HiveStorageHandler> handlerClass = (Class<? extends HiveStorageHandler>) Class.forName(storageHandler, true, Utilities.getSessionSpecifiedClassLoader());
return (HiveStorageHandler) ReflectionUtils.newInstance(handlerClass, conf);
} catch (ClassNotFoundException e) {
throw new IOException("Error in loading storage handler." + e.getMessage(), e);
}
}
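A minimal sketch of both branches of this method, assuming the standard Hive/Hadoop classes named below are on the classpath; the concrete class names are illustrative, not mandated by the snippet above.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.metadata.HiveStorageHandler;
import org.apache.hive.hcatalog.common.HCatUtil;

public class GetStorageHandlerSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();

    // storageHandler == null: a FosterStorageHandler wraps the supplied file-based
    // InputFormat, OutputFormat and SerDe (class names are illustrative).
    HiveStorageHandler foster = HCatUtil.getStorageHandler(conf,
        null,
        "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
        "org.apache.hadoop.mapred.TextInputFormat",
        "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");

    // A named handler class: the class is loaded from the session classloader and
    // instantiated directly, so it must be available at runtime.
    HiveStorageHandler named = HCatUtil.getStorageHandler(conf,
        "org.apache.hadoop.hive.hbase.HBaseStorageHandler", null, null, null);

    System.out.println(foster.getClass() + " / " + named.getClass());
  }
}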
Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
The class FileOutputFormatContainer, method getRecordWriter.
@Override
public RecordWriter<WritableComparable<?>, HCatRecord> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
// This needs to be set manually; under normal circumstances the MR Task does this.
setWorkOutputPath(context);
//Configure the output key and value classes.
// This is required for writing null as key for file based tables.
context.getConfiguration().set("mapred.output.key.class", NullWritable.class.getName());
String jobInfoString = context.getConfiguration().get(HCatConstants.HCAT_KEY_OUTPUT_INFO);
OutputJobInfo jobInfo = (OutputJobInfo) HCatUtil.deserialize(jobInfoString);
StorerInfo storeInfo = jobInfo.getTableInfo().getStorerInfo();
HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(context.getConfiguration(), storeInfo);
Class<? extends AbstractSerDe> serde = storageHandler.getSerDeClass();
AbstractSerDe sd = (AbstractSerDe) ReflectionUtils.newInstance(serde, context.getConfiguration());
context.getConfiguration().set("mapred.output.value.class", sd.getSerializedClass().getName());
RecordWriter<WritableComparable<?>, HCatRecord> rw;
if (HCatBaseOutputFormat.getJobInfo(context.getConfiguration()).isDynamicPartitioningUsed()) {
// When Dynamic partitioning is used, the RecordWriter instance initialized here isn't used. Can use null.
// (That's because records can't be written until the values of the dynamic partitions are deduced.
// By that time, a new local instance of RecordWriter, with the correct output-path, will be constructed.)
rw = new DynamicPartitionFileRecordWriterContainer((org.apache.hadoop.mapred.RecordWriter) null, context);
} else {
Path parentDir = new Path(context.getConfiguration().get("mapred.work.output.dir"));
Path childPath = new Path(parentDir, FileOutputFormat.getUniqueName(new JobConf(context.getConfiguration()), context.getConfiguration().get("mapreduce.output.basename", "part")));
rw = new StaticPartitionFileRecordWriterContainer(getBaseOutputFormat().getRecordWriter(parentDir.getFileSystem(context.getConfiguration()), new JobConf(context.getConfiguration()), childPath.toString(), InternalUtil.createReporter(context)), context);
}
return rw;
}
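This RecordWriter is normally obtained by the MapReduce framework rather than by user code. A hedged sketch of the write-side job setup that leads to it (database and table names are illustrative): HCatOutputFormat is the public entry point, and FileOutputFormatContainer is chosen internally for file-backed tables.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.data.DefaultHCatRecord;
import org.apache.hive.hcatalog.mapreduce.HCatOutputFormat;
import org.apache.hive.hcatalog.mapreduce.OutputJobInfo;

public class HCatWriteJobSketch {
  public static Job configure(Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "hcat-write-sketch");
    // Illustrative database/table; a null partition map means an unpartitioned write.
    HCatOutputFormat.setOutput(job, OutputJobInfo.create("default", "sample_table", null));
    job.setOutputFormatClass(HCatOutputFormat.class);
    job.setOutputKeyClass(WritableComparable.class);
    job.setOutputValueClass(DefaultHCatRecord.class);
    return job;
  }
}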
Use of org.apache.hadoop.hive.ql.metadata.HiveStorageHandler in project hive by apache.
The class HCatBaseInputFormat, method createRecordReader.
/**
* Create the RecordReader for the given InputSplit. Returns the underlying
* RecordReader if the required operations are supported and the schema matches
* the HCatTable schema. Returns an HCatRecordReader if the operations need to
* be implemented in HCat.
* @param split the split
* @param taskContext the task attempt context
* @return the record reader instance, either an HCatRecordReader (later) or
* the underlying storage handler's RecordReader
* @throws IOException or InterruptedException
*/
@Override
public RecordReader<WritableComparable, HCatRecord> createRecordReader(InputSplit split, TaskAttemptContext taskContext) throws IOException, InterruptedException {
HCatSplit hcatSplit = InternalUtil.castToHCatSplit(split);
PartInfo partitionInfo = hcatSplit.getPartitionInfo();
// Ensure PartInfo's TableInfo is initialized.
if (partitionInfo.getTableInfo() == null) {
partitionInfo.setTableInfo(((InputJobInfo) HCatUtil.deserialize(taskContext.getConfiguration().get(HCatConstants.HCAT_KEY_JOB_INFO))).getTableInfo());
}
JobContext jobContext = taskContext;
Configuration conf = jobContext.getConfiguration();
HiveStorageHandler storageHandler = HCatUtil.getStorageHandler(conf, partitionInfo);
JobConf jobConf = HCatUtil.getJobConfFromContext(jobContext);
Map<String, String> jobProperties = partitionInfo.getJobProperties();
HCatUtil.copyJobPropertiesToJobConf(jobProperties, jobConf);
Map<String, Object> valuesNotInDataCols = getColValsNotInDataColumns(getOutputSchema(conf), partitionInfo);
return new HCatRecordReader(storageHandler, valuesNotInDataCols);
}
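Like the writer above, this reader is invoked by the framework for each HCatSplit. A hedged sketch of the read-side job setup that exercises it (database and table names are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hive.hcatalog.mapreduce.HCatInputFormat;

public class HCatReadJobSketch {
  public static Job configure(Configuration conf) throws Exception {
    Job job = Job.getInstance(conf, "hcat-read-sketch");
    // Illustrative database/table; the framework later calls createRecordReader
    // (shown above) for each HCatSplit produced from this input.
    HCatInputFormat.setInput(job, "default", "sample_table");
    job.setInputFormatClass(HCatInputFormat.class);
    return job;
  }
}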