Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
In class DelegatingInputFormat, the method getSplits:
public InputSplit[] getSplits(JobConf conf, int numSplits) throws IOException {
  JobConf confCopy = new JobConf(conf);
  List<InputSplit> splits = new ArrayList<InputSplit>();
  Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(conf);
  Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(conf);
  Map<Class<? extends InputFormat>, List<Path>> formatPaths =
      new HashMap<Class<? extends InputFormat>, List<Path>>();
  // First, build a map of InputFormats to Paths
  for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
    if (!formatPaths.containsKey(entry.getValue().getClass())) {
      formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
    }
    formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
  }
  for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
    Class<? extends InputFormat> formatClass = formatEntry.getKey();
    InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
    List<Path> paths = formatEntry.getValue();
    Map<Class<? extends Mapper>, List<Path>> mapperPaths =
        new HashMap<Class<? extends Mapper>, List<Path>>();
    // Then, for each set of paths that share an InputFormat, build
    // a map of Mappers to the paths they're used for
    for (Path path : paths) {
      Class<? extends Mapper> mapperClass = mapperMap.get(path);
      if (!mapperPaths.containsKey(mapperClass)) {
        mapperPaths.put(mapperClass, new LinkedList<Path>());
      }
      mapperPaths.get(mapperClass).add(path);
    }
    // Each set of paths that shares an InputFormat and a Mapper can
    // be added to the same job, and split together.
    for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
      paths = mapEntry.getValue();
      Class<? extends Mapper> mapperClass = mapEntry.getKey();
      if (mapperClass == null) {
        mapperClass = conf.getMapperClass();
      }
      FileInputFormat.setInputPaths(confCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      InputSplit[] pathSplits = format.getSplits(confCopy, numSplits);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
      }
    }
  }
  return splits.toArray(new InputSplit[splits.size()]);
}
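As a usage note: the splitting above only runs when a job is configured through MultipleInputs. A minimal sketch of that configuration, assuming hypothetical input paths and using the stock IdentityMapper in place of real map classes:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.MultipleInputs;

public class MultipleInputsJobSetup {
  public static JobConf configure() {
    JobConf conf = new JobConf();
    // Register a per-path InputFormat and Mapper; addInputPath records the
    // mappings read back by getInputFormatMap/getMapperTypeMap above and
    // installs DelegatingInputFormat as the job's input format.
    MultipleInputs.addInputPath(conf, new Path("/data/plain"),
        TextInputFormat.class, IdentityMapper.class);
    MultipleInputs.addInputPath(conf, new Path("/data/kv"),
        KeyValueTextInputFormat.class, IdentityMapper.class);
    return conf;
  }
}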
Use of org.apache.hadoop.mapred.InputFormat in project hadoop by apache.
In class TestMultipleInputs, the method testAddInputPathWithFormat:
@Test
public void testAddInputPathWithFormat() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class);
  MultipleInputs.addInputPath(conf, new Path("/bar"), KeyValueTextInputFormat.class);
  final Map<Path, InputFormat> inputs = MultipleInputs.getInputFormatMap(conf);
  assertEquals(TextInputFormat.class, inputs.get(new Path("/foo")).getClass());
  assertEquals(KeyValueTextInputFormat.class, inputs.get(new Path("/bar")).getClass());
}
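A hedged companion sketch (not quoted from the test class) for the four-argument overload, which also records a Mapper per path; IdentityMapper again stands in for a real map implementation:

@Test
public void testAddInputPathWithMapperSketch() {
  final JobConf conf = new JobConf();
  MultipleInputs.addInputPath(conf, new Path("/foo"), TextInputFormat.class,
      IdentityMapper.class);
  final Map<Path, Class<? extends Mapper>> maps = MultipleInputs.getMapperTypeMap(conf);
  assertEquals(IdentityMapper.class, maps.get(new Path("/foo")));
}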
Use of org.apache.hadoop.mapred.InputFormat in project presto by prestodb.
In class HiveUtil, the method createRecordReader:
public static RecordReader<?, ?> createRecordReader(Configuration configuration, Path path,
    long start, long length, Properties schema, List<HiveColumnHandle> columns) {
  // determine which hive columns we will read
  List<HiveColumnHandle> readColumns =
      ImmutableList.copyOf(filter(columns, column -> column.getColumnType() == REGULAR));
  List<Integer> readHiveColumnIndexes =
      ImmutableList.copyOf(transform(readColumns, HiveColumnHandle::getHiveColumnIndex));
  // Tell Hive which columns we would like to read; this lets Hive optimize
  // reads of column-oriented files
  setReadColumns(configuration, readHiveColumnIndexes);
  InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, true);
  JobConf jobConf = new JobConf(configuration);
  FileSplit fileSplit = new FileSplit(path, start, length, (String[]) null);
  // propagate serialization configuration to getRecordReader
  schema.stringPropertyNames().stream()
      .filter(name -> name.startsWith("serialization."))
      .forEach(name -> jobConf.set(name, schema.getProperty(name)));
  try {
    return retry().stopOnIllegalExceptions().run("createRecordReader",
        () -> inputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL));
  } catch (Exception e) {
    throw new PrestoException(HIVE_CANNOT_OPEN_SPLIT,
        format("Error opening Hive split %s (offset=%s, length=%s) using %s: %s",
            path, start, length, getInputFormatName(schema), e.getMessage()), e);
  }
}
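The caller receives a raw mapred RecordReader. A minimal sketch of draining it, assuming only the stable RecordReader contract (createKey/createValue/next/close); the helper name is ours:

static <K, V> long countRecords(RecordReader<K, V> reader) throws IOException {
  long count = 0;
  final K key = reader.createKey();
  final V value = reader.createValue();
  try {
    // next() fills key/value with the next record, returning false at end of split
    while (reader.next(key, value)) {
      count++;
    }
  } finally {
    reader.close();
  }
  return count;
}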
Use of org.apache.hadoop.mapred.InputFormat in project drill by apache.
In class ConvertHiveParquetScanToDrillParquetScan, the method getInputFormatFromSD:
/**
 * Get the input format from the given {@link StorageDescriptor}.
 * @param properties table properties to merge into the job configuration
 * @param hiveReadEntry read entry for the Hive table being scanned
 * @param sd storage descriptor of the table or partition
 * @param hiveConf Hive configuration used to build the {@link JobConf}
 * @return {@link InputFormat} class, or null if a failure has occurred. Failure is logged as a warning.
 */
private Class<? extends InputFormat<?, ?>> getInputFormatFromSD(final Properties properties,
    final HiveReadEntry hiveReadEntry, final StorageDescriptor sd, final HiveConf hiveConf) {
  final Table hiveTable = hiveReadEntry.getTable();
  try {
    final String inputFormatName = sd.getInputFormat();
    if (!Strings.isNullOrEmpty(inputFormatName)) {
      return (Class<? extends InputFormat<?, ?>>) Class.forName(inputFormatName);
    }
    final JobConf job = new JobConf(hiveConf);
    HiveUtilities.addConfToJob(job, properties);
    return HiveUtilities.getInputFormatClass(job, sd, hiveTable);
  } catch (final Exception e) {
    logger.warn("Failed to get InputFormat class from Hive table '{}.{}'. StorageDescriptor [{}]",
        hiveTable.getDbName(), hiveTable.getTableName(), sd.toString(), e);
    return null;
  }
}
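A hedged sketch of how the returned class might be consumed by the rule: checking that the table actually uses Hive's Parquet input format before rewriting the scan. MapredParquetInputFormat is the real Hive class; the surrounding control flow is illustrative, not quoted from Drill:

final Class<? extends InputFormat<?, ?>> format =
    getInputFormatFromSD(properties, hiveReadEntry, sd, hiveConf);
if (format == null || !MapredParquetInputFormat.class.equals(format)) {
  return; // not a Parquet table, or the lookup failed; leave the plan unchanged
}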
Use of org.apache.hadoop.mapred.InputFormat in project drill by apache.
In class HiveMetadataProvider, the method splitInputWithUGI:
private List<InputSplitWrapper> splitInputWithUGI(final Properties properties,
    final StorageDescriptor sd, final Partition partition) throws Exception {
  watch.start();
  try {
    return ugi.doAs(new PrivilegedExceptionAction<List<InputSplitWrapper>>() {
      @Override
      public List<InputSplitWrapper> run() throws Exception {
        final List<InputSplitWrapper> splits = Lists.newArrayList();
        final JobConf job = new JobConf(hiveConf);
        HiveUtilities.addConfToJob(job, properties);
        job.setInputFormat(HiveUtilities.getInputFormatClass(job, sd, hiveReadEntry.getTable()));
        final Path path = new Path(sd.getLocation());
        final FileSystem fs = path.getFileSystem(job);
        if (fs.exists(path)) {
          FileInputFormat.addInputPath(job, path);
          final InputFormat<?, ?> format = job.getInputFormat();
          for (final InputSplit split : format.getSplits(job, 1)) {
            splits.add(new InputSplitWrapper(split, partition));
          }
        }
        return splits;
      }
    });
  } catch (final InterruptedException | IOException e) {
    final String errMsg = String.format("Failed to create input splits: %s", e.getMessage());
    logger.error(errMsg, e);
    throw new DrillRuntimeException(errMsg, e);
  } finally {
    logger.trace("Took {} µs to get splits from {}",
        watch.elapsed(TimeUnit.NANOSECONDS) / 1000, sd.getLocation());
    watch.stop();
  }
}
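The ugi field is assumed to be established elsewhere. A minimal sketch of the common proxy-user pattern for obtaining it, assuming the engine impersonates the query's end user (the user name parameter is hypothetical):

import java.io.IOException;
import org.apache.hadoop.security.UserGroupInformation;

static UserGroupInformation proxyUgiFor(String queryUser) throws IOException {
  // Run as queryUser on top of the service's login identity, so the
  // getSplits() call above sees the end user's filesystem permissions.
  return UserGroupInformation.createProxyUser(queryUser,
      UserGroupInformation.getLoginUser());
}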