Use of org.apache.hadoop.io.SequenceFile.Reader in project incubator-systemml by apache:
class TfUtils, method getPartFileID.
/**
 * Function to generate custom file names (transform-part-.....) for
 * mappers' output for the ApplyTfCSV job. The idea is to find the index
 * of (thisFile, fileOffset) in the list of all offsets from the
 * counters/offsets file, which was generated by either the GenTfMtdMR
 * or the AssignRowIDMR job.
 *
 * @param job job configuration
 * @param offset file offset
 * @return part file id (i.e., 00001, 00002, etc.)
 * @throws IOException if IOException occurs
 */
public String getPartFileID(JobConf job, long offset) throws IOException {
    Reader reader = null;
    int id = 0;
    try {
        reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    String sid = Integer.toString(id);
    char[] carr = new char[5 - sid.length()];
    Arrays.fill(carr, '0');
    String ret = (new String(carr)).concat(sid);
    return ret;
}
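The last four lines pad the numeric id to five characters by hand. As a minimal sketch (not part of the SystemML code, and assuming the id stays below 100000 as the manual padding also requires), the same result can be produced with String.format:

// Hypothetical helper, not from TfUtils: pad the numeric id to five digits.
private static String toPartFileID(int id) {
    return String.format("%05d", id);   // e.g. 7 -> "00007"
}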
Use of org.apache.hadoop.io.SequenceFile.Reader in project nifi by apache:
class KeyValueReader, method readSequenceFile.
@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    final SequenceFile.Reader reader;
    Set<FlowFile> flowFiles = new HashSet<>();
    reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Read from SequenceFile: {}", new Object[] { file });
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may or may not be a file name
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }
            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
    return flowFiles;
}
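Both NiFi readers follow the same skeleton: open the reader through the option-based constructor, iterate the keys, and close the reader in a finally block. Stripped of the NiFi session and FlowFile handling, a minimal stand-alone sketch of that skeleton (hypothetical helper, not NiFi code) might look like:

// Minimal sketch: list the keys of a SequenceFile.
// SequenceFile.Reader is Closeable in Hadoop 2.x, so try-with-resources
// can stand in for the explicit IOUtils.closeQuietly(reader) used above.
static void printKeys(Configuration conf, Path file) throws IOException {
    FileSystem fs = file.getFileSystem(conf);
    try (SequenceFile.Reader reader =
             new SequenceFile.Reader(conf, SequenceFile.Reader.file(fs.makeQualified(file)))) {
        Text key = new Text();
        while (reader.next(key)) {   // reads only the key of each record
            System.out.println(key);
            key.clear();
        }
    }
}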
Use of org.apache.hadoop.io.SequenceFile.Reader in project nifi by apache:
class ValueReader, method readSequenceFile.
@Override
public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    Set<FlowFile> flowFiles = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Reading from sequence file {}", new Object[] { file });
    final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader);
    Text key = new Text();
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may or may not be a file name
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }
            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            try {
                flowFile = session.write(flowFile, writer);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
    return flowFiles;
}
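Aside from the callback, this method mirrors KeyValueReader.readSequenceFile above. As the class names and the callback.key = key assignment suggest, KeyValueReader emits each record's key and value through KeyValueWriterCallback, while ValueReader writes only the record values through OutputStreamWritableCallback; the filename-derivation logic for the resulting FlowFiles is identical in both.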
Use of org.apache.hadoop.io.SequenceFile.Reader in project spark-dataflow by cloudera:
class HadoopFileFormatPipelineTest, method testSequenceFile.
@Test
public void testSequenceFile() throws Exception {
    populateFile();
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    @SuppressWarnings("unchecked")
    Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass = (Class<? extends FileInputFormat<IntWritable, Text>>) (Class<?>) SequenceFileInputFormat.class;
    HadoopIO.Read.Bound<IntWritable, Text> read = HadoopIO.Read.from(inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
    PCollection<KV<IntWritable, Text>> input = p.apply(read);
    @SuppressWarnings("unchecked")
    Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass = (Class<? extends FileOutputFormat<IntWritable, Text>>) (Class<?>) TemplatedSequenceFileOutputFormat.class;
    @SuppressWarnings("unchecked")
    HadoopIO.Write.Bound<IntWritable, Text> write = HadoopIO.Write.to(outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
    input.apply(write.withoutSharding());
    EvaluationResult res = SparkPipelineRunner.create().run(p);
    res.close();
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Reader reader = new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
        int i = 0;
        while (reader.next(key, value)) {
            assertEquals(i, key.get());
            assertEquals("value-" + i, value.toString());
            i++;
        }
    }
}
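populateFile() is not shown here. Judging only from the assertions at the end of the test, a plausible sketch of it (hypothetical record count, assuming inputFile is a local java.io.File field) would write IntWritable/Text pairs with SequenceFile.createWriter:

// Hypothetical reconstruction of populateFile(), not the actual test code.
// Writes keys 0..4 with values "value-0".."value-4", matching the assertions above.
private void populateFile() throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(inputFile.toURI());
    try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
            SequenceFile.Writer.file(path),
            SequenceFile.Writer.keyClass(IntWritable.class),
            SequenceFile.Writer.valueClass(Text.class))) {
        for (int i = 0; i < 5; i++) {
            writer.append(new IntWritable(i), new Text("value-" + i));
        }
    }
}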
Use of org.apache.hadoop.io.SequenceFile.Reader in project nutch by apache:
class LinkReader, method read.
@Override
public List read(String path) throws FileNotFoundException {
    List<HashMap> rows = new ArrayList<>();
    Path file = new Path(path);
    // try-with-resources guarantees the reader is closed even if reading fails mid-file
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, Reader.file(file))) {
        Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        LinkDatum value = new LinkDatum();
        while (reader.next(key, value)) {
            try {
                HashMap<String, String> t_row = getLinksRow(key, value);
                rows.add(t_row);
            } catch (Exception e) {
                // skip records that cannot be converted into a link row, keep reading
            }
        }
    } catch (FileNotFoundException fne) {
        throw new FileNotFoundException();
    } catch (IOException e) {
        LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
        throw new WebApplicationException();
    }
    return rows;
}