Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache.
The class DagUtils, method createVertexFromMapWork.
/*
* Helper function to create Vertex from MapWork.
*/
private Vertex createVertexFromMapWork(JobConf conf, MapWork mapWork, Path mrScratchDir, VertexType vertexType) throws Exception {
// set up the operator plan
Utilities.cacheMapWork(conf, mapWork, mrScratchDir);
// create the directories FileSinkOperators need
Utilities.createTmpDirs(conf, mapWork);
// finally create the vertex
Vertex map = null;
// use tez to combine splits
boolean groupSplitsInInputInitializer;
DataSourceDescriptor dataSource;
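// -1 lets Tez derive the vertex parallelism from the data source (e.g. when splits are generated in the AM)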
int numTasks = -1;
@SuppressWarnings("rawtypes") Class inputFormatClass = conf.getClass("mapred.input.format.class", InputFormat.class);
boolean vertexHasCustomInput = VertexType.isCustomInputType(vertexType);
LOG.info("Vertex has custom input? " + vertexHasCustomInput);
if (vertexHasCustomInput) {
groupSplitsInInputInitializer = false;
// Grouping happens at execution time; it will be enabled in the CustomVertex, not by the input initializer.
if (inputFormatClass != BucketizedHiveInputFormat.class && inputFormatClass != HiveInputFormat.class) {
// As of now only these two formats are supported.
inputFormatClass = HiveInputFormat.class;
}
conf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
// mapreduce.tez.input.initializer.serialize.event.payload should be set to false when using
// this plug-in to avoid getting a serialized event at run-time.
conf.setBoolean("mapreduce.tez.input.initializer.serialize.event.payload", false);
} else {
// Tez will group splits for us only if the input format is HiveInputFormat
if (inputFormatClass == HiveInputFormat.class) {
groupSplitsInInputInitializer = true;
} else {
groupSplitsInInputInitializer = false;
}
}
if (mapWork instanceof MergeFileWork) {
Path outputPath = ((MergeFileWork) mapWork).getOutputDir();
// prepare the tmp output directory. The output tmp directory should
// exist before jobClose (before renaming after job completion)
Path tempOutPath = Utilities.toTempPath(outputPath);
try {
FileSystem tmpOutFS = tempOutPath.getFileSystem(conf);
if (!tmpOutFS.exists(tempOutPath)) {
tmpOutFS.mkdirs(tempOutPath);
}
} catch (IOException e) {
throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage(), e);
}
}
// remember mapping of plan to input
conf.set(Utilities.INPUT_NAME, mapWork.getName());
if (HiveConf.getBoolVar(conf, ConfVars.HIVE_AM_SPLIT_GENERATION)) {
// set up the operator plan. (before setting up splits on the AM)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
// When generating splits in the AM, we just need to set the correct input initializer plugin.
if (groupSplitsInInputInitializer) {
// Not setting a payload, since the MRInput payload is the same and can be accessed.
InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(HiveSplitGenerator.class.getName());
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(true).setCustomInitializerDescriptor(descriptor).build();
} else {
// Not HiveInputFormat, or a custom VertexManager will take care of grouping splits
if (vertexHasCustomInput && vertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
// SMB Join.
dataSource = MultiMRInput.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
} else {
dataSource = MRInputLegacy.createConfigBuilder(conf, inputFormatClass).groupSplits(false).build();
}
}
} else {
// Setup client side split generation.
// we need to set this, because with HS2 and client side split
// generation we end up not finding the map work. This is
// because of thread local madness (tez split generation is
// multi-threaded - HS2 plan cache uses thread locals). Setting
// VECTOR_MODE/USE_VECTORIZED_INPUT_FILE_FORMAT causes the split gen code to use the conf instead
// of the map work.
conf.setBoolean(Utilities.VECTOR_MODE, mapWork.getVectorMode());
conf.setBoolean(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, mapWork.getUseVectorizedInputFileFormat());
InputSplitInfo inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(conf, false, 0);
InputInitializerDescriptor descriptor = InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName());
InputDescriptor inputDescriptor = InputDescriptor.create(MRInputLegacy.class.getName())
.setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
.setConfigurationBytes(TezUtils.createByteStringFromConf(conf))
.setSplits(inputSplitInfo.getSplitsProto())
.build().toByteString().asReadOnlyByteBuffer()));
dataSource = DataSourceDescriptor.create(inputDescriptor, descriptor, null);
numTasks = inputSplitInfo.getNumTasks();
// set up the operator plan. (after generating splits - that changes configs)
Utilities.setMapWork(conf, mapWork, mrScratchDir, false);
}
UserPayload serializedConf = TezUtils.createUserPayloadFromConf(conf);
String procClassName = MapTezProcessor.class.getName();
if (mapWork instanceof MergeFileWork) {
procClassName = MergeFileTezProcessor.class.getName();
}
map = Vertex.create(mapWork.getName(), ProcessorDescriptor.create(procClassName).setUserPayload(serializedConf), numTasks, getContainerResource(conf));
map.setTaskEnvironment(getContainerEnvironment(conf, true));
assert mapWork.getAliasToWork().keySet().size() == 1;
// Add the actual source input
String alias = mapWork.getAliasToWork().keySet().iterator().next();
map.addDataSource(alias, dataSource);
return map;
}
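For reference, here is a minimal, self-contained sketch of the split-grouping decision made above. The class and enum names are simplified stand-ins rather than the real Hive/Tez types, and the custom-input rule is inferred from the branch structure of createVertexFromMapWork, so treat it as an illustration rather than the actual implementation.

public class SplitGroupingDecisionSketch {

  enum VertexType { INITIALIZED_EDGES, MULTI_INPUT_UNINITIALIZED_EDGES }

  // assumption: "custom input" means the vertex receives its input through a custom vertex manager
  static boolean isCustomInputType(VertexType type) {
    return type == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES;
  }

  // Mirrors the decision above: custom-input vertices defer grouping to the custom vertex,
  // and the input initializer only groups splits when the input format is HiveInputFormat.
  static boolean groupSplitsInInputInitializer(VertexType type, String inputFormatClassName) {
    if (isCustomInputType(type)) {
      return false;
    }
    return "org.apache.hadoop.hive.ql.io.HiveInputFormat".equals(inputFormatClassName);
  }

  public static void main(String[] args) {
    String hif = "org.apache.hadoop.hive.ql.io.HiveInputFormat";
    System.out.println(groupSplitsInInputInitializer(VertexType.INITIALIZED_EDGES, hif)); // true
    System.out.println(groupSplitsInInputInitializer(VertexType.MULTI_INPUT_UNINITIALIZED_EDGES, hif)); // false
    System.out.println(groupSplitsInInputInitializer(VertexType.INITIALIZED_EDGES, "org.apache.hadoop.mapred.TextInputFormat")); // false
  }
}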
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache.
The class HiveSplitGenerator, method initialize.
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
if (getContext() != null) {
// called from Tez AM.
prepare(getContext());
}
// Setup the map work for this thread. Pruning modified the work instance to potentially remove
// partitions. The same work instance must be used when generating splits.
Utilities.setMapWork(jobConf, work);
try {
boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
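// when false (DagUtils disables it for custom-input vertices), split events carry the split object directly instead of a serialized proto payload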
// perform dynamic partition pruning
if (pruner != null) {
pruner.initialize(getContext(), work, jobConf);
pruner.prune();
}
InputSplitInfoMem inputSplitInfo = null;
boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
String realInputFormatName = conf.get("mapred.input.format.class");
boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
if (groupingEnabled) {
// Need to instantiate the realInputFormat
InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
int totalResource = 0;
int taskResource = 0;
int availableSlots = 0;
// FIXME. Do the right thing Luke.
if (getContext() == null) {
// for now, totalResource = taskResource for llap
availableSlots = 1;
}
if (getContext() != null) {
totalResource = getContext().getTotalAvailableResource().getMemory();
taskResource = getContext().getVertexTaskResource().getMemory();
availableSlots = totalResource / taskResource;
}
if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
// broken configuration from mapred-default.xml
final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
LOG.info("The preferred split size is " + preferredSplitSize);
}
float waves;
// Create the un-grouped splits
if (numSplits.isPresent()) {
waves = numSplits.get().floatValue() / availableSlots;
} else {
waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
}
InputSplit[] splits;
if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
MapWork mapWork = Utilities.getMapWork(jobConf);
List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
FileSystem fs = paths.get(0).getFileSystem(jobConf);
FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
if (fileStatuses.length == 0) {
// Generating a single split typically happens when reading the output of an order-by query.
// If the order-by query returned no rows, no files will exist in the input path.
splits = new InputSplit[0];
} else {
// If files exist in the input path there must be exactly one, as this code path is triggered only
// for order-by queries, which are expected to write a single file (produced by one reducer).
Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
splits = new InputSplit[1];
FileStatus fileStatus = fileStatuses[0];
BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
Set<String> hostsSet = new HashSet<>();
for (BlockLocation location : locations) {
hostsSet.addAll(Lists.newArrayList(location.getHosts()));
}
String[] hosts = hostsSet.toArray(new String[0]);
FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
String alias = mapWork.getAliases().get(0);
PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
String partIF = partDesc.getInputFileFormatClassName();
splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
}
} else {
// Raw splits
splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
}
// Sort the splits, so that subsequent grouping is consistent.
Arrays.sort(splits, new InputSplitComparator());
LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
// increment/set input counters
InputInitializerContext inputInitializerContext = getContext();
TezCounters tezCounters = null;
String counterName;
String groupName = null;
String vertexName = null;
if (inputInitializerContext != null) {
try {
tezCounters = new TezCounters();
groupName = HiveInputCounters.class.getName();
vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
tezCounters.findCounter(groupName, counterName).increment(splits.length);
final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
tezCounters.findCounter(groupName, counterName).increment(paths.size());
final Set<String> files = new HashSet<>();
for (InputSplit inputSplit : splits) {
if (inputSplit instanceof FileSplit) {
final FileSplit fileSplit = (FileSplit) inputSplit;
final Path path = fileSplit.getPath();
// The assumption here is that the path is a file; the only case where this differs is ACID deltas.
// The isFile check is avoided here for performance reasons.
final String fileStr = path.toString();
if (!files.contains(fileStr)) {
files.add(fileStr);
}
}
}
counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
tezCounters.findCounter(groupName, counterName).increment(files.size());
} catch (Exception e) {
LOG.warn("Caught exception while trying to update Tez counters", e);
}
}
if (work.getIncludedBuckets() != null) {
splits = pruneBuckets(work, splits);
}
Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
// And finally return them in a flat array
InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
LOG.info("Number of split groups: " + flatSplits.length);
if (inputInitializerContext != null) {
try {
counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
LOG.debug("Published tez counters: {}", tezCounters);
inputInitializerContext.addCounters(tezCounters);
} catch (Exception e) {
LOG.warn("Caught exception while trying to update Tez counters", e);
}
}
List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
} else {
// If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
// inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
}
return createEventList(sendSerializedEvents, inputSplitInfo);
} finally {
Utilities.clearWork(jobConf);
}
}
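To make the sizing arithmetic above concrete, the following standalone sketch reruns it with hard-coded example values. In the real code the memory figures come from the Tez vertex context, and the split-size and wave factors come from dfs.blocksize and the TezMapReduceSplitsGrouper defaults; the 50 MB minimum grouping size and 1.7 wave factor used here are assumptions about those defaults.

public class SplitSizingSketch {
  public static void main(String[] args) {
    int totalResourceMb = 65536; // assumed total memory available to the Tez AM
    int taskResourceMb = 4096;   // assumed per-task container memory
    int availableSlots = totalResourceMb / taskResourceMb; // 16 concurrent tasks

    float waves = 1.7f; // assumed TEZ_GROUPING_SPLIT_WAVES default
    // note the cast in the original call: slots * (int) waves, so a 1.7 wave factor behaves like 1
    int requestedRawSplits = Math.multiplyExact(availableSlots, (int) waves); // 16

    // fallback applied when mapreduce.input.fileinputformat.split.minsize is <= 1:
    // prefer min(blockSize / 2, Tez minimum grouping size)
    long blockSize = 128L * 1024 * 1024;  // assumed dfs.blocksize
    long minGrouping = 50L * 1024 * 1024; // assumed TEZ_GROUPING_SPLIT_MIN_SIZE default
    long preferredSplitSize = Math.min(blockSize / 2, minGrouping); // 50 MB

    System.out.printf("slots=%d requestedRawSplits=%d preferredSplitSize=%d%n",
        availableSlots, requestedRawSplits, preferredSplitSize);
  }
}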
Use of org.apache.hadoop.hive.ql.io.HiveInputFormat in project hive by apache.
The class TestCompactor, method testStatsAfterCompactionPartTbl.
/**
* After each major compaction, stats need to be updated on the table
* 1. create a partitioned ORC backed table (Orc is currently required by ACID)
* 2. populate with data
* 3. compute stats
* 4. Trigger major compaction on one of the partitions (which should update stats)
* 5. check that stats have been updated for that partition only
*
* @throws Exception todo:
* 4. add a test with sorted table?
*/
@Test
public void testStatsAfterCompactionPartTbl() throws Exception {
// as of (8/27/2014) Hive 0.14, ACID/Orc requires HiveInputFormat
String dbName = "default";
String tblName = "compaction_test";
executeStatementOnDriver("drop table if exists " + tblName, driver);
executeStatementOnDriver("CREATE TABLE " + tblName + "(a INT, b STRING) " + " PARTITIONED BY(bkt INT)" + // currently ACID requires table to be bucketed
" CLUSTERED BY(a) INTO 4 BUCKETS" + " STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder().withFieldDelimiter(',').build();
HiveStreamingConnection connection = HiveStreamingConnection.newBuilder()
.withDatabase(dbName)
.withTable(tblName)
.withStaticPartitionValues(Arrays.asList("0"))
.withAgentInfo("UT_" + Thread.currentThread().getName())
.withHiveConf(conf)
.withRecordWriter(writer)
.connect();
connection.beginTransaction();
connection.write("55, 'London'".getBytes());
connection.commitTransaction();
connection.beginTransaction();
connection.write("56, 'Paris'".getBytes());
connection.commitTransaction();
connection.close();
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(57, 'Budapest')", driver);
executeStatementOnDriver("INSERT INTO TABLE " + tblName + " PARTITION(bkt=1)" + " values(58, 'Milano')", driver);
execSelectAndDumpData("select * from " + tblName, driver, "Dumping data for " + tblName + " after load:");
TxnStore txnHandler = TxnUtils.getTxnStore(conf);
Table table = msClient.getTable(dbName, tblName);
// compute stats before compaction
CompactionInfo ci = new CompactionInfo(dbName, tblName, "bkt=0", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
ci = new CompactionInfo(dbName, tblName, "bkt=1", CompactionType.MAJOR);
Worker.StatsUpdater.gatherStats(ci, conf, System.getProperty("user.name"), CompactorUtil.getCompactorJobQueueName(conf, ci, table));
// Check basic stats are collected
org.apache.hadoop.hive.ql.metadata.Table hiveTable = Hive.get().getTable(tblName);
List<org.apache.hadoop.hive.ql.metadata.Partition> partitions = Hive.get().getPartitions(hiveTable);
Map<String, String> parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1373", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
// Do a major compaction
CompactionRequest rqst = new CompactionRequest(dbName, tblName, CompactionType.MAJOR);
rqst.setPartitionname("bkt=0");
txnHandler.compact(rqst);
runWorker(conf);
ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
List<ShowCompactResponseElement> compacts = rsp.getCompacts();
if (1 != compacts.size()) {
Assert.fail("Expecting 1 file and found " + compacts.size() + " files " + compacts);
}
Assert.assertEquals("ready for cleaning", compacts.get(0).getState());
// Check basic stats are updated for partition bkt=0, but not updated for partition bkt=1
partitions = Hive.get().getPartitions(hiveTable);
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=0")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "1", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "801", parameters.get("totalSize"));
parameters = partitions.stream().filter(p -> p.getName().equals("bkt=1")).findFirst().orElseThrow(() -> new RuntimeException("Could not get Partition")).getParameters();
Assert.assertEquals("The number of files is differing from the expected", "2", parameters.get("numFiles"));
Assert.assertEquals("The number of rows is differing from the expected", "2", parameters.get("numRows"));
Assert.assertEquals("The total table size is differing from the expected", "1442", parameters.get("totalSize"));
}
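As a usage note, the trigger-and-verify cycle exercised by this test boils down to a handful of calls. The sketch below isolates that cycle; it assumes a configured HiveConf is in scope and that the compaction worker is run by the surrounding harness (the test's runWorker helper), so it is a distilled mirror of the test rather than standalone production code.

static void requestMajorCompactionAndCheck(HiveConf conf) throws Exception {
  TxnStore txnHandler = TxnUtils.getTxnStore(conf);
  // ask for a major compaction of a single partition
  CompactionRequest rqst = new CompactionRequest("default", "compaction_test", CompactionType.MAJOR);
  rqst.setPartitionname("bkt=0");
  txnHandler.compact(rqst);
  // ... the compaction worker must run here; the test above uses its runWorker(conf) helper ...
  ShowCompactResponse rsp = txnHandler.showCompact(new ShowCompactRequest());
  for (ShowCompactResponseElement e : rsp.getCompacts()) {
    // after a successful major compaction the entry should report "ready for cleaning"
    System.out.println(e.getPartitionname() + " -> " + e.getState());
  }
}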