Use of org.apache.hadoop.mapred.TextInputFormat in project flink by apache.
From the class WordCountMapredITCase, the method internalRun:
private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;
    if (isTestDeprecatedAPI) {
        input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath);
    } else {
        input = env.createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {

        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    // split up the lines in pairs (2-tuples) containing: (word,1)
    DataSet<Tuple2<String, Integer>> counts = text.flatMap(new Tokenizer()).groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts.map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

        @Override
        public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
            return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
        }
    });

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
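
The Tokenizer passed to flatMap above is not shown in this snippet. A minimal sketch, assuming the standard Flink WordCount tokenizer (a static nested class; it needs org.apache.flink.api.common.functions.FlatMapFunction, org.apache.flink.api.java.tuple.Tuple2 and org.apache.flink.util.Collector):

public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
        // normalize the line and split it into tokens
        String[] tokens = value.toLowerCase().split("\\W+");
        // emit one (word, 1) pair per non-empty token
        for (String token : tokens) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}
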
Use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.
From the class IOUtils, the method createFileWindowingInput:
@SuppressWarnings("unchecked")
public static WindowingInput createFileWindowingInput(String path, String inputFormatClassName,
        String serDeClassName, Properties serDeProperties, Configuration conf) throws WindowingException {
    try {
        HiveConf hConf = new HiveConf(conf, IOUtils.class);
        JobConf job = new JobConf(hConf);
        Path p = new Path(path);
        p = makeQualified(p, conf);

        Class<? extends InputFormat<? extends Writable, ? extends Writable>> inputFormatClass =
                (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(inputFormatClassName);
        hConf.setClass("mapred.input.format.class", inputFormatClass, InputFormat.class);
        hConf.set(INPUT_INPUTFORMAT_CLASS, inputFormatClass.getName());

        InputFormat<? extends Writable, ? extends Writable> iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat) {
            ((TextInputFormat) iFmt).configure(job);
        }
        FileInputFormat.addInputPath(job, p);
        InputSplit[] iSplits = iFmt.getSplits(job, 1);
        org.apache.hadoop.mapred.RecordReader<Writable, Writable> rdr =
                (org.apache.hadoop.mapred.RecordReader<Writable, Writable>) iFmt.getRecordReader(iSplits[0], job, Reporter.NULL);

        hConf.set(INPUT_PATH, path);
        hConf.set(INPUT_KEY_CLASS, rdr.createKey().getClass().getName());
        hConf.set(INPUT_VALUE_CLASS, rdr.createValue().getClass().getName());
        hConf.set(INPUT_SERDE_CLASS, serDeClassName);

        TableWindowingInput tIn = new TableWindowingInput();
        tIn.initialize(null, hConf, serDeProperties);
        return tIn;
    } catch (Exception e) {
        throw new WindowingException(e);
    }
}
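
A hypothetical invocation of the method above for a plain text table read through TextInputFormat and LazySimpleSerDe; the path and the column properties are illustrative, not taken from the SQLWindowing sources:

Properties serDeProps = new Properties();
// minimal LazySimpleSerDe schema: column names and types (illustrative values)
serDeProps.setProperty("columns", "ts,userid,amount");
serDeProps.setProperty("columns.types", "string,string,double");

WindowingInput in = IOUtils.createFileWindowingInput(
        "/data/sales",                                                  // table directory (hypothetical)
        org.apache.hadoop.mapred.TextInputFormat.class.getName(),
        org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName(),
        serDeProps,
        new Configuration());
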
Use of org.apache.hadoop.mapred.TextInputFormat in project SQLWindowing by hbutani.
From the class TableWindowingInput, the method initialize:
@SuppressWarnings("unchecked")
@Override
public void initialize(InputStream ins, Configuration conf, Properties tbl) throws IOException {
    try {
        setupSerDe(conf, tbl);
        tableDirPath = new Path(conf.get(INPUT_PATH));
        keyClass = (Class<? extends Writable>) Class.forName(conf.get(INPUT_KEY_CLASS));
        valueClass = (Class<? extends Writable>) Class.forName(conf.get(INPUT_VALUE_CLASS));
        inputFormatClass = (Class<? extends InputFormat<? extends Writable, ? extends Writable>>) Class.forName(conf.get(INPUT_INPUTFORMAT_CLASS));

        FileSystem fs = FileSystem.get(tableDirPath.toUri(), conf);
        FileStatus tableDir = fs.getFileStatus(tableDirPath);
        assert tableDir.isDir();
        tableFiles = fs.listStatus(tableDirPath);

        jconf = new JobConf();
        iFmt = inputFormatClass.newInstance();
        if (iFmt instanceof TextInputFormat) {
            ((TextInputFormat) iFmt).configure(jconf);
        }
        jconf.set("mapred.input.dir", tableDirPath.toString());
        currSplitNum = 0;
        iSplits = iFmt.getSplits(jconf, 1);
        nextSplit();
    } catch (Exception e) {
        throw new IOException(e);
    }
}
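
Both SQLWindowing snippets follow the same pattern: instantiate the InputFormat reflectively and call configure() when it is a TextInputFormat, because TextInputFormat implements JobConfigurable and uses configure() to initialize its compression codec factory before getSplits() or getRecordReader() is called. A slightly more general sketch of that guard (not taken from the SQLWindowing sources):

// configure any JobConfigurable input format (TextInputFormat included) before use
InputFormat<?, ?> iFmt = inputFormatClass.newInstance();
if (iFmt instanceof JobConfigurable) {
    ((JobConfigurable) iFmt).configure(jconf);
}
InputSplit[] splits = iFmt.getSplits(jconf, 1);
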
Use of org.apache.hadoop.mapred.TextInputFormat in project presto by prestodb.
From the class BackgroundHiveSplitLoader, the method loadPartition:
private void loadPartition(HivePartitionMetadata partition) throws IOException {
    String partitionName = partition.getHivePartition().getPartitionId();
    Properties schema = getPartitionSchema(table, partition.getPartition());
    List<HivePartitionKey> partitionKeys = getPartitionKeys(table, partition.getPartition());
    TupleDomain<HiveColumnHandle> effectivePredicate = partition.getHivePartition().getEffectivePredicate();

    Path path = new Path(getPartitionLocation(table, partition.getPartition()));
    Configuration configuration = hdfsEnvironment.getConfiguration(path);
    InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false);
    FileSystem fs = hdfsEnvironment.getFileSystem(session.getUser(), path);

    if (inputFormat instanceof SymlinkTextInputFormat) {
        if (bucketHandle.isPresent()) {
            throw new PrestoException(StandardErrorCode.NOT_SUPPORTED, "Bucketed table in SymlinkTextInputFormat is not yet supported");
        }
        // TODO: This should use an iterator like the HiveFileIterator
        for (Path targetPath : getTargetPathsFromSymlink(fs, path)) {
            // The input should be in TextInputFormat.
            TextInputFormat targetInputFormat = new TextInputFormat();
            // get the configuration for the target path -- it may be a different hdfs instance
            Configuration targetConfiguration = hdfsEnvironment.getConfiguration(targetPath);
            JobConf targetJob = new JobConf(targetConfiguration);
            targetJob.setInputFormat(TextInputFormat.class);
            targetInputFormat.configure(targetJob);
            FileInputFormat.setInputPaths(targetJob, targetPath);
            InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0);
            if (addSplitsToSource(targetSplits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions())) {
                return;
            }
        }
        return;
    }

    // To support custom input formats, we want to call getSplits()
    // on the input format to obtain file splits.
    if (shouldUseFileSplitsFromInputFormat(inputFormat)) {
        JobConf jobConf = new JobConf(configuration);
        FileInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
        addSplitsToSource(splits, partitionName, partitionKeys, schema, effectivePredicate, partition.getColumnCoercions());
        return;
    }

    // If only one bucket could match: load that one file
    HiveFileIterator iterator = new HiveFileIterator(path, fs, directoryLister, namenodeStats, partitionName, inputFormat, schema, partitionKeys, effectivePredicate, partition.getColumnCoercions());
    if (!buckets.isEmpty()) {
        int bucketCount = buckets.get(0).getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (HiveBucket bucket : buckets) {
            int bucketNumber = bucket.getBucketNumber();
            LocatedFileStatus file = list.get(bucketNumber);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketNumber), effectivePredicate, partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }

    // If table is bucketed: list the directory, sort, tag with bucket id
    if (bucketHandle.isPresent()) {
        // HiveFileIterator skips hidden files automatically.
        int bucketCount = bucketHandle.get().getBucketCount();
        List<LocatedFileStatus> list = listAndSortBucketFiles(iterator, bucketCount);
        List<Iterator<HiveSplit>> iteratorList = new ArrayList<>();
        for (int bucketIndex = 0; bucketIndex < bucketCount; bucketIndex++) {
            LocatedFileStatus file = list.get(bucketIndex);
            boolean splittable = isSplittable(iterator.getInputFormat(), hdfsEnvironment.getFileSystem(session.getUser(), file.getPath()), file.getPath());
            iteratorList.add(createHiveSplitIterator(iterator.getPartitionName(), file.getPath().toString(), file.getBlockLocations(), 0, file.getLen(), iterator.getSchema(), iterator.getPartitionKeys(), splittable, session, OptionalInt.of(bucketIndex), iterator.getEffectivePredicate(), partition.getColumnCoercions()));
        }
        addToHiveSplitSourceRoundRobin(iteratorList);
        return;
    }

    fileIterators.addLast(iterator);
}
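
The symlink branch above relies on a private helper, getTargetPathsFromSymlink, that is not shown. A rough sketch of what such a helper might do, assuming the usual SymlinkTextInputFormat layout in which each manifest file under the partition directory lists one target data path per line (uses java.io.BufferedReader, java.io.InputStreamReader and java.nio.charset.StandardCharsets):

private static List<Path> getTargetPathsFromSymlink(FileSystem fs, Path symlinkDir) throws IOException {
    List<Path> targets = new ArrayList<>();
    for (FileStatus manifest : fs.listStatus(symlinkDir)) {
        if (manifest.isDirectory()) {
            continue; // only plain manifest files carry target paths
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(manifest.getPath()), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (!line.trim().isEmpty()) {
                    targets.add(new Path(line.trim()));
                }
            }
        }
    }
    return targets;
}
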
Use of org.apache.hadoop.mapred.TextInputFormat in project hive by apache.
From the class VectorDeserializeOrcWriter, the method create:
// TODO: if more writers are added, separate out an EncodingWriterFactory
public static EncodingWriter create(InputFormat<?, ?> sourceIf, Deserializer serDe, Map<Path, PartitionDesc> parts,
        Configuration daemonConf, Configuration jobConf, Path splitPath, StructObjectInspector sourceOi,
        List<Integer> sourceIncludes, boolean[] cacheIncludes, int allocSize) throws IOException {
    // Vector SerDe can be disabled both on client and server side.
    if (!HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            || !HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            || !(sourceIf instanceof TextInputFormat) || !(serDe instanceof LazySimpleSerDe)) {
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    Path path = splitPath.getFileSystem(daemonConf).makeQualified(splitPath);
    PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
    if (partDesc == null) {
        LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: no partition desc for " + path);
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    Properties tblProps = partDesc.getTableDesc().getProperties();
    if ("true".equalsIgnoreCase(tblProps.getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST))) {
        LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter due to "
                + serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST);
        return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
    for (StructField sf : sourceOi.getAllStructFieldRefs()) {
        Category c = sf.getFieldObjectInspector().getCategory();
        if (c != Category.PRIMITIVE) {
            LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: " + c + " is not supported");
            return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
        }
    }
    LlapIoImpl.LOG.info("Creating VertorDeserializeOrcWriter for " + path);
    return new VectorDeserializeOrcWriter(daemonConf, tblProps, sourceOi, sourceIncludes, cacheIncludes, allocSize);
}
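
The early-exit condition at the top of create() can be read as a single predicate: the vectorized text path is only used when both the daemon and the job enable it, the source is plain TextInputFormat, and rows are decoded by LazySimpleSerDe. A hypothetical helper (not part of the Hive source) that expresses the same check:

static boolean canUseVectorSerDePath(Configuration daemonConf, Configuration jobConf,
        InputFormat<?, ?> sourceIf, Deserializer serDe) {
    // all four conditions must hold; otherwise the caller falls back to DeserializerOrcWriter
    return HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            && HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
            && sourceIf instanceof TextInputFormat
            && serDe instanceof LazySimpleSerDe;
}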