Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class CustomPartitionVertex, method onRootVertexInitialized:
// One call per root Input
@Override
public void onRootVertexInitialized(String inputName, InputDescriptor inputDescriptor, List<Event> events) {
  numInputsSeenSoFar++;
  LOG.info("On root vertex initialized " + inputName);
  try {
    // This is using the payload from the RootVertexInitializer corresponding
    // to InputName. Ideally it should be using its own configuration class -
    // but that means serializing another instance.
    MRInputUserPayloadProto protoPayload = MRInputHelpers.parseMRInputPayload(inputDescriptor.getUserPayload());
    this.conf = TezUtils.createConfFromByteString(protoPayload.getConfigurationBytes());
    /*
     * Currently in tez, the flow of events is thus:
     * "Generate Splits -> Initialize Vertex" (with parallelism info obtained
     * from the generate splits phase). The generate splits phase groups
     * splits using the TezGroupedSplitsInputFormat. However, for bucket map
     * joins the grouping done by this input format results in incorrect
     * results as the grouper has no knowledge of buckets. So, we initially
     * set the input format to be HiveInputFormat (in DagUtils) for the case
     * of bucket map joins so as to obtain un-grouped splits. We then group
     * the splits corresponding to buckets using the tez grouper which returns
     * TezGroupedSplits.
     */
    // This assumes that Grouping will always be used.
    // Enabling grouping on the payload.
    MRInputUserPayloadProto updatedPayload = MRInputUserPayloadProto.newBuilder(protoPayload).setGroupingEnabled(true).build();
    inputDescriptor.setUserPayload(UserPayload.create(updatedPayload.toByteString().asReadOnlyByteBuffer()));
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  boolean dataInformationEventSeen = false;
  Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<String, Set<FileSplit>>();
  for (Event event : events) {
    if (event instanceof InputConfigureVertexTasksEvent) {
      // No tasks should have been started yet. Checked by initial state check.
      LOG.info("Got a input configure vertex event for input: " + inputName);
      Preconditions.checkState(dataInformationEventSeen == false);
      InputConfigureVertexTasksEvent cEvent = (InputConfigureVertexTasksEvent) event;
      // The vertex cannot be configured until all DataEvents are seen - to
      // build the routing table.
      configureVertexTaskEvent = cEvent;
      LOG.info("Configure task for input name: " + inputName + " num tasks: " + configureVertexTaskEvent.getNumTasks());
    }
    if (event instanceof InputUpdatePayloadEvent) {
      // This event can never occur. If it does, fail.
      Preconditions.checkState(false);
    } else if (event instanceof InputDataInformationEvent) {
      dataInformationEventSeen = true;
      InputDataInformationEvent diEvent = (InputDataInformationEvent) event;
      FileSplit fileSplit;
      try {
        fileSplit = getFileSplitFromEvent(diEvent);
      } catch (IOException e) {
        throw new RuntimeException("Failed to get file split for event: " + diEvent, e);
      }
      Set<FileSplit> fsList = pathFileSplitsMap.get(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()));
      if (fsList == null) {
        fsList = new TreeSet<FileSplit>(new PathComparatorForSplit());
        pathFileSplitsMap.put(Utilities.getBucketFileNameFromPathSubString(fileSplit.getPath().getName()), fsList);
      }
      fsList.add(fileSplit);
    }
  }
  LOG.debug("Path file splits map for input name: {} is {}", inputName, pathFileSplitsMap);
  Multimap<Integer, InputSplit> bucketToInitialSplitMap = getBucketSplitMapForPath(inputName, pathFileSplitsMap);
  try {
    int totalResource = context.getTotalAvailableResource().getMemory();
    int taskResource = context.getVertexTaskResource().getMemory();
    float waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
    int availableSlots = totalResource / taskResource;
    LOG.debug("Grouping splits. {} available slots, {} waves. Bucket initial splits map: {}", availableSlots, waves, bucketToInitialSplitMap);
    JobConf jobConf = new JobConf(conf);
    ShimLoader.getHadoopShims().getMergedCredentials(jobConf);
    Multimap<Integer, InputSplit> bucketToGroupedSplitMap = HashMultimap.<Integer, InputSplit>create();
    boolean secondLevelGroupingDone = false;
    if ((mainWorkName.isEmpty()) || (inputName.compareTo(mainWorkName) == 0)) {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, mainWorkName.isEmpty(), splitLocationProvider);
        if (mainWorkName.isEmpty() == false) {
          Multimap<Integer, InputSplit> singleBucketToGroupedSplit = HashMultimap.<Integer, InputSplit>create();
          singleBucketToGroupedSplit.putAll(key, groupedSplit.values());
          groupedSplit = grouper.group(jobConf, singleBucketToGroupedSplit, availableSlots, HiveConf.getFloatVar(conf, HiveConf.ConfVars.TEZ_SMB_NUMBER_WAVES), null);
          secondLevelGroupingDone = true;
        }
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      processAllEvents(inputName, bucketToGroupedSplitMap, secondLevelGroupingDone);
    } else {
      SplitLocationProvider splitLocationProvider = Utils.getSplitLocationProvider(conf, LOG);
      // Group all the bucket files.
      for (Integer key : bucketToInitialSplitMap.keySet()) {
        InputSplit[] inputSplitArray = (bucketToInitialSplitMap.get(key).toArray(new InputSplit[0]));
        Multimap<Integer, InputSplit> groupedSplit = grouper.generateGroupedSplits(jobConf, conf, inputSplitArray, waves, availableSlots, inputName, false, splitLocationProvider);
        bucketToGroupedSplitMap.putAll(key, groupedSplit.values());
      }
      /*
       * This is the small table side. In case of SMB join, we need to send each split to the
       * corresponding bucket-based task on the other side. In case a split needs to go to
       * multiple downstream tasks, we need to clone the event and send it to the right
       * destination.
       */
      LOG.info("This is the side work - multi-mr work.");
      processAllSideEventsSetParallelism(inputName, bucketToGroupedSplitMap);
    }
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
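The core of this method is grouping the incoming FileSplits by the bucket file name embedded in each split's path before handing them to the Tez grouper. A minimal, self-contained sketch of that first grouping step is below; bucketNameOf is a crude hypothetical stand-in for Utilities.getBucketFileNameFromPathSubString, and the comparator only approximates the role of PathComparatorForSplit.

import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.hadoop.mapred.FileSplit;

public class BucketSplitGrouping {
  // Crude stand-in for Utilities.getBucketFileNameFromPathSubString: treat the
  // part of the file name before the first '_' as the bucket identifier.
  static String bucketNameOf(String fileName) {
    int idx = fileName.indexOf('_');
    return idx > 0 ? fileName.substring(0, idx) : fileName;
  }

  public static Map<String, Set<FileSplit>> groupByBucketFile(List<FileSplit> splits) {
    // TreeMap keeps bucket names in a stable order; the TreeSet orders splits of
    // the same bucket file by path and start offset, mirroring the role of
    // PathComparatorForSplit in CustomPartitionVertex.
    Map<String, Set<FileSplit>> pathFileSplitsMap = new TreeMap<>();
    for (FileSplit split : splits) {
      String bucket = bucketNameOf(split.getPath().getName());
      pathFileSplitsMap
          .computeIfAbsent(bucket, b -> new TreeSet<>(
              Comparator.comparing((FileSplit s) -> s.getPath().toString())
                  .thenComparingLong(FileSplit::getStart)))
          .add(split);
    }
    return pathFileSplitsMap;
  }
}

Each per-bucket set then feeds getBucketSplitMapForPath and the Tez grouper, which is where the real class builds the bucket-to-grouped-split multimap.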
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class HiveSplitGenerator, method initialize:
@SuppressWarnings("unchecked")
@Override
public List<Event> initialize() throws Exception {
  if (getContext() != null) {
    // called from Tez AM.
    prepare(getContext());
  }
  // Setup the map work for this thread. Pruning modified the work instance to potentially remove
  // partitions. The same work instance must be used when generating splits.
  Utilities.setMapWork(jobConf, work);
  try {
    boolean sendSerializedEvents = conf.getBoolean("mapreduce.tez.input.initializer.serialize.event.payload", true);
    // perform dynamic partition pruning
    if (pruner != null) {
      pruner.initialize(getContext(), work, jobConf);
      pruner.prune();
    }
    InputSplitInfoMem inputSplitInfo = null;
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
      // Need to instantiate the realInputFormat
      InputFormat<?, ?> inputFormat = (InputFormat<?, ?>) ReflectionUtils.newInstance(JavaUtils.loadClass(realInputFormatName), jobConf);
      int totalResource = 0;
      int taskResource = 0;
      int availableSlots = 0;
      // FIXME. Do the right thing Luke.
      if (getContext() == null) {
        // for now, totalResource = taskResource for llap
        availableSlots = 1;
      }
      if (getContext() != null) {
        totalResource = getContext().getTotalAvailableResource().getMemory();
        taskResource = getContext().getVertexTaskResource().getMemory();
        availableSlots = totalResource / taskResource;
      }
      if (HiveConf.getLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1) <= 1) {
        // broken configuration from mapred-default.xml
        final long blockSize = conf.getLongBytes(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT);
        final long minGrouping = conf.getLong(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_MIN_SIZE_DEFAULT);
        final long preferredSplitSize = Math.min(blockSize / 2, minGrouping);
        HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, preferredSplitSize);
        LOG.info("The preferred split size is " + preferredSplitSize);
      }
      float waves;
      // Create the un-grouped splits
      if (numSplits.isPresent()) {
        waves = numSplits.get().floatValue() / availableSlots;
      } else {
        waves = conf.getFloat(TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES, TezMapReduceSplitsGrouper.TEZ_GROUPING_SPLIT_WAVES_DEFAULT);
      }
      InputSplit[] splits;
      if (generateSingleSplit && conf.get(HiveConf.ConfVars.HIVETEZINPUTFORMAT.varname).equals(HiveInputFormat.class.getName())) {
        MapWork mapWork = Utilities.getMapWork(jobConf);
        List<Path> paths = Utilities.getInputPathsTez(jobConf, mapWork);
        FileSystem fs = paths.get(0).getFileSystem(jobConf);
        FileStatus[] fileStatuses = fs.listStatus(paths.get(0));
        if (fileStatuses.length == 0) {
          // Generating a single split typically happens when reading data out of ORDER BY queries.
          // If the ORDER BY query returns no rows, no files will exist in the input path.
          splits = new InputSplit[0];
        } else {
          // If files exist in the input path there has to be exactly one, as this code path is
          // triggered only for ORDER BY queries, which are expected to write a single file
          // (written by one reducer).
          Preconditions.checkState(paths.size() == 1 && fileStatuses.length == 1 && mapWork.getAliasToPartnInfo().size() == 1, "Requested to generate single split. Paths and fileStatuses are expected to be 1. " + "Got paths: " + paths.size() + " fileStatuses: " + fileStatuses.length);
          splits = new InputSplit[1];
          FileStatus fileStatus = fileStatuses[0];
          BlockLocation[] locations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
          Set<String> hostsSet = new HashSet<>();
          for (BlockLocation location : locations) {
            hostsSet.addAll(Lists.newArrayList(location.getHosts()));
          }
          String[] hosts = hostsSet.toArray(new String[0]);
          FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
          String alias = mapWork.getAliases().get(0);
          PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);
          String partIF = partDesc.getInputFileFormatClassName();
          splits[0] = new HiveInputFormat.HiveInputSplit(fileSplit, partIF);
        }
      } else {
        // Raw splits
        splits = inputFormat.getSplits(jobConf, numSplits.orElse(Math.multiplyExact(availableSlots, (int) waves)));
      }
      // Sort the splits, so that subsequent grouping is consistent.
      Arrays.sort(splits, new InputSplitComparator());
      LOG.info("Number of input splits: " + splits.length + ". " + availableSlots + " available slots, " + waves + " waves. Input format is: " + realInputFormatName);
      // increment/set input counters
      InputInitializerContext inputInitializerContext = getContext();
      TezCounters tezCounters = null;
      String counterName;
      String groupName = null;
      String vertexName = null;
      if (inputInitializerContext != null) {
        try {
          tezCounters = new TezCounters();
          groupName = HiveInputCounters.class.getName();
          vertexName = jobConf.get(Operator.CONTEXT_NAME_KEY, "");
          counterName = Utilities.getVertexCounterName(HiveInputCounters.RAW_INPUT_SPLITS.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(splits.length);
          final List<Path> paths = Utilities.getInputPathsTez(jobConf, work);
          counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_DIRECTORIES.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(paths.size());
          final Set<String> files = new HashSet<>();
          for (InputSplit inputSplit : splits) {
            if (inputSplit instanceof FileSplit) {
              final FileSplit fileSplit = (FileSplit) inputSplit;
              final Path path = fileSplit.getPath();
              // The assumption here is that the path is a file. The only case where this differs is ACID deltas.
              // The isFile check is avoided here for performance reasons.
              final String fileStr = path.toString();
              if (!files.contains(fileStr)) {
                files.add(fileStr);
              }
            }
          }
          counterName = Utilities.getVertexCounterName(HiveInputCounters.INPUT_FILES.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).increment(files.size());
        } catch (Exception e) {
          LOG.warn("Caught exception while trying to update Tez counters", e);
        }
      }
      if (work.getIncludedBuckets() != null) {
        splits = pruneBuckets(work, splits);
      }
      Multimap<Integer, InputSplit> groupedSplits = splitGrouper.generateGroupedSplits(jobConf, conf, splits, waves, availableSlots, splitLocationProvider);
      // And finally return them in a flat array
      InputSplit[] flatSplits = groupedSplits.values().toArray(new InputSplit[0]);
      LOG.info("Number of split groups: " + flatSplits.length);
      if (inputInitializerContext != null) {
        try {
          counterName = Utilities.getVertexCounterName(HiveInputCounters.GROUPED_INPUT_SPLITS.name(), vertexName);
          tezCounters.findCounter(groupName, counterName).setValue(flatSplits.length);
          LOG.debug("Published tez counters: {}", tezCounters);
          inputInitializerContext.addCounters(tezCounters);
        } catch (Exception e) {
          LOG.warn("Caught exception while trying to update Tez counters", e);
        }
      }
      List<TaskLocationHint> locationHints = splitGrouper.createTaskLocationHints(flatSplits, generateConsistentSplits);
      inputSplitInfo = new InputSplitInfoMem(flatSplits, locationHints, flatSplits.length, null, jobConf);
    } else {
      // If this is used in the future - make sure to disable grouping in the payload, if it isn't already disabled
      throw new RuntimeException("HiveInputFormat does not support non-grouped splits, InputFormatName is: " + realInputFormatName);
      // inputSplitInfo = MRInputHelpers.generateInputSplitsToMem(jobConf, false, 0);
    }
    return createEventList(sendSerializedEvents, inputSplitInfo);
  } finally {
    Utilities.clearWork(jobConf);
  }
}
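A recurring piece of arithmetic in this initializer is turning cluster capacity into a target split count: availableSlots is the vertex's total memory divided by per-task memory, and the desired number of pre-grouping splits is roughly availableSlots * waves. A small sketch of that calculation follows; the class and parameter names are illustrative, not part of HiveSplitGenerator.

public final class SplitCountEstimator {
  /**
   * Rough sketch of the slot/wave arithmetic shown above. HiveSplitGenerator
   * obtains the resource figures from the InputInitializerContext and the wave
   * factor from the TezMapReduceSplitsGrouper configuration.
   */
  public static int estimateTargetSplits(int totalResourceMb, int taskResourceMb, float waves) {
    if (taskResourceMb <= 0) {
      // No per-task resource information (compare the LLAP-style fallback in
      // the snippet above): assume a single slot.
      return Math.max(1, Math.round(waves));
    }
    int availableSlots = totalResourceMb / taskResourceMb;
    // One wave fills every slot once; a wave factor above 1 slightly
    // over-partitions so that faster tasks can pick up extra work.
    return Math.max(1, Math.round(availableSlots * waves));
  }

  public static void main(String[] args) {
    // e.g. 64 GB of vertex resource, 4 GB per task, 1.7 waves -> 27 splits
    System.out.println(estimateTargetSplits(65536, 4096, 1.7f));
  }
}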
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class HostAffinitySplitLocationProvider, method getLocations:
@Override
public String[] getLocations(InputSplit split) throws IOException {
  if (!(split instanceof FileSplit)) {
    LOG.debug("Split: {} is not a FileSplit. Using default locations", split);
    return split.getLocations();
  }
  FileSplit fsplit = (FileSplit) split;
  String location = locations.get(determineLocation(locations, fsplit));
  return (location != null) ? new String[] { location } : null;
}
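getLocations only narrows a FileSplit down to one preferred host; the actual choice is delegated to determineLocation against a fixed list of known locations. The sketch below illustrates the general pattern of deriving a stable host from a split by hashing its path and start offset; it is not the real determineLocation logic, only the general idea.

import java.util.List;
import org.apache.hadoop.mapred.FileSplit;

public final class SimpleSplitAffinity {
  /**
   * Illustrative only: map a FileSplit to a stable index into a host list so the
   * same split is always routed to the same host, which is the consistency
   * property a host-affinity location provider relies on.
   */
  public static String pickHost(List<String> hosts, FileSplit split) {
    int hash = (split.getPath().toString() + ":" + split.getStart()).hashCode();
    return hosts.get(Math.floorMod(hash, hosts.size()));
  }
}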
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class Base64TextInputFormat, method getRecordReader:
public RecordReader<LongWritable, BytesWritable> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
  reporter.setStatus(genericSplit.toString());
  Base64LineRecordReader reader = new Base64LineRecordReader(new LineRecordReader(job, (FileSplit) genericSplit));
  reader.configure(job);
  return reader;
}
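The reader returned here is a thin decorator: LineRecordReader produces raw lines and Base64LineRecordReader decodes each one into bytes. A hedged sketch of that wrapping pattern is below; DecodingRecordReader is a hypothetical name, and the decoding is simplified to plain java.util.Base64, whereas Hive's Base64LineRecordReader also validates an optional signature prefix.

import java.io.IOException;
import java.util.Base64;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.RecordReader;

/** Hypothetical sketch of wrapping a line-oriented reader with base64 decoding. */
public class DecodingRecordReader implements RecordReader<LongWritable, BytesWritable> {

  private final RecordReader<LongWritable, Text> lineReader;
  private final Text line = new Text();

  public DecodingRecordReader(RecordReader<LongWritable, Text> lineReader) {
    this.lineReader = lineReader;
  }

  @Override
  public boolean next(LongWritable key, BytesWritable value) throws IOException {
    if (!lineReader.next(key, line)) {
      return false;
    }
    // Decode the raw line into the reusable BytesWritable buffer.
    byte[] decoded = Base64.getDecoder().decode(line.toString().trim());
    value.set(decoded, 0, decoded.length);
    return true;
  }

  @Override
  public LongWritable createKey() {
    return lineReader.createKey();
  }

  @Override
  public BytesWritable createValue() {
    return new BytesWritable();
  }

  @Override
  public long getPos() throws IOException {
    return lineReader.getPos();
  }

  @Override
  public float getProgress() throws IOException {
    return lineReader.getProgress();
  }

  @Override
  public void close() throws IOException {
    lineReader.close();
  }
}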
Use of org.apache.hadoop.mapred.FileSplit in project hive by apache.
The class RCFileCat, method run:
@Override
public int run(String[] args) throws Exception {
  long start = 0L;
  long length = -1L;
  int recordCount = 0;
  long startT = System.currentTimeMillis();
  boolean verbose = false;
  boolean columnSizes = false;
  boolean pretty = false;
  boolean fileSizes = false;
  // get options from arguments
  if (args.length < 1 || args.length > 3) {
    printUsage(null);
    return -1;
  }
  Path fileName = null;
  for (int i = 0; i < args.length; i++) {
    String arg = args[i];
    if (arg.startsWith("--start=")) {
      start = Long.parseLong(arg.substring("--start=".length()));
    } else if (arg.startsWith("--length=")) {
      length = Long.parseLong(arg.substring("--length=".length()));
    } else if (arg.equals("--verbose")) {
      verbose = true;
    } else if (arg.equals("--column-sizes")) {
      columnSizes = true;
    } else if (arg.equals("--column-sizes-pretty")) {
      columnSizes = true;
      pretty = true;
    } else if (arg.equals("--file-sizes")) {
      fileSizes = true;
    } else if (fileName == null) {
      fileName = new Path(arg);
    } else {
      printUsage(null);
      return -1;
    }
  }
  setupBufferedOutput();
  FileSystem fs = FileSystem.get(fileName.toUri(), conf);
  long fileLen = fs.getFileStatus(fileName).getLen();
  if (start < 0) {
    start = 0;
  }
  if (start > fileLen) {
    return 0;
  }
  if (length < 0 || (start + length) > fileLen) {
    length = fileLen - start;
  }
  // share the code with RecordReader.
  FileSplit split = new FileSplit(fileName, start, length, new JobConf(conf));
  RCFileRecordReader recordReader = new RCFileRecordReader(conf, split);
  if (columnSizes || fileSizes) {
    // Print out the un/compressed sizes of each column
    long[] compressedColumnSizes = null;
    long[] uncompressedColumnSizes = null;
    // un/compressed sizes of file and no. of rows
    long rowNo = 0;
    long uncompressedFileSize = 0;
    long compressedFileSize = 0;
    // Skip from block to block since we only need the header
    while (recordReader.nextBlock()) {
      // Get the sizes from the key buffer and aggregate
      KeyBuffer keyBuffer = recordReader.getKeyBuffer();
      if (uncompressedColumnSizes == null) {
        uncompressedColumnSizes = new long[keyBuffer.getColumnNumber()];
      }
      if (compressedColumnSizes == null) {
        compressedColumnSizes = new long[keyBuffer.getColumnNumber()];
      }
      for (int i = 0; i < keyBuffer.getColumnNumber(); i++) {
        uncompressedColumnSizes[i] += keyBuffer.getEachColumnUncompressedValueLen()[i];
        compressedColumnSizes[i] += keyBuffer.getEachColumnValueLen()[i];
      }
      rowNo += keyBuffer.getNumberRows();
    }
    if (columnSizes && uncompressedColumnSizes != null && compressedColumnSizes != null) {
      // otherwise print it out as if it were a row
      for (int i = 0; i < uncompressedColumnSizes.length; i++) {
        if (pretty) {
          System.out.println("Column " + i + ": Uncompressed size: " + uncompressedColumnSizes[i] + " Compressed size: " + compressedColumnSizes[i]);
        } else {
          System.out.print(i + TAB + uncompressedColumnSizes[i] + TAB + compressedColumnSizes[i] + NEWLINE);
        }
      }
    }
    if (fileSizes) {
      if (uncompressedColumnSizes != null && compressedColumnSizes != null) {
        for (int i = 0; i < uncompressedColumnSizes.length; i++) {
          uncompressedFileSize += uncompressedColumnSizes[i];
          compressedFileSize += compressedColumnSizes[i];
        }
      }
      System.out.print("File size (uncompressed): " + uncompressedFileSize + ". File size (compressed): " + compressedFileSize + ". Number of rows: " + rowNo + "." + NEWLINE);
    }
    System.out.flush();
    return 0;
  }
  LongWritable key = new LongWritable();
  BytesRefArrayWritable value = new BytesRefArrayWritable();
  // extra capacity in case we overrun, to avoid resizing
  StringBuilder buf = new StringBuilder(STRING_BUFFER_SIZE);
  while (recordReader.next(key, value)) {
    printRecord(value, buf);
    recordCount++;
    if (verbose && (recordCount % RECORD_PRINT_INTERVAL) == 0) {
      long now = System.currentTimeMillis();
      System.err.println("Read " + recordCount / 1024 + "k records");
      System.err.println("Read " + ((recordReader.getPos() / (1024L * 1024L))) + "MB");
      System.err.printf("Input scan rate %.2f MB/s\n", (recordReader.getPos() * 1.0 / (now - startT)) / 1024.0);
    }
    if (buf.length() > STRING_BUFFER_FLUSH_SIZE) {
      System.out.print(buf.toString());
      buf.setLength(0);
    }
  }
  // print out last part of buffer
  System.out.print(buf.toString());
  System.out.flush();
  return 0;
}
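Before constructing the FileSplit, run clamps the requested start and length to the actual file extent, then reads only that byte range. That range handling is the reusable part; a small sketch follows, where makeClampedSplit is a hypothetical helper rather than anything in RCFileCat.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

public final class SplitRanges {
  /**
   * Hypothetical helper mirroring the range handling in RCFileCat.run: clamp
   * (start, length) to the file's real size and build a FileSplit covering that
   * range. Returns null when start lies beyond the end of the file.
   */
  public static FileSplit makeClampedSplit(Configuration conf, Path file, long start, long length)
      throws IOException {
    FileSystem fs = FileSystem.get(file.toUri(), conf);
    long fileLen = fs.getFileStatus(file).getLen();
    if (start < 0) {
      start = 0;
    }
    if (start > fileLen) {
      return null;
    }
    if (length < 0 || start + length > fileLen) {
      length = fileLen - start;
    }
    // Same JobConf-based constructor as the call in RCFileCat; it builds the
    // split without precomputed block locations.
    return new FileSplit(file, start, length, new JobConf(conf));
  }
}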