Example 1 with Reader

use of org.apache.hadoop.io.SequenceFile.Reader in project incubator-systemml by apache.

the class TfUtils method getPartFileID.

/**
	 * Function to generate custom file names (transform-part-.....) for
	 * mappers' output for ApplyTfCSV job. The idea is to find the index 
	 * of (thisfile, fileoffset) in the list of all offsets from the 
	 * counters/offsets file, which was generated from either GenTfMtdMR
	 * or AssignRowIDMR job.
	 * 
	 * @param job job configuration
	 * @param offset file offset
	 * @return part file id (ie, 00001, 00002, etc)
	 * @throws IOException if IOException occurs
	 */
public String getPartFileID(JobConf job, long offset) throws IOException {
    Reader reader = null;
    int id = 0;
    try {
        reader = initOffsetsReader(job);
        ByteWritable key = new ByteWritable();
        OffsetCount value = new OffsetCount();
        String thisFile = TfUtils.getPartFileName(job);
        while (reader.next(key, value)) {
            if (thisFile.equals(value.filename) && value.fileOffset == offset)
                break;
            id++;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }
    String sid = Integer.toString(id);
    char[] carr = new char[5 - sid.length()];
    Arrays.fill(carr, '0');
    String ret = (new String(carr)).concat(sid);
    return ret;
}
Also used : OffsetCount(org.apache.sysml.runtime.matrix.CSVReblockMR.OffsetCount) MatrixReader(org.apache.sysml.runtime.io.MatrixReader) Reader(org.apache.hadoop.io.SequenceFile.Reader) ByteWritable(org.apache.hadoop.io.ByteWritable)
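
The trailing block of getPartFileID pads the numeric id to a fixed five digits with a char array. The same padding can be expressed with String.format; the tiny class below is only an illustrative sketch (the class and method names are made up, not SystemML code).

// Minimal sketch: zero-padding an id to five digits, equivalent to the
// char-array padding in getPartFileID (assumes the id fits in five digits).
public class PartFileId {

    static String format(int id) {
        return String.format("%05d", id); // e.g. 7 -> "00007"
    }

    public static void main(String[] args) {
        System.out.println(format(42)); // prints 00042
    }
}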

Example 2 with Reader

use of org.apache.hadoop.io.SequenceFile.Reader in project nifi by apache.

the class KeyValueReader method readSequenceFile.

@Override
public Set<FlowFile> readSequenceFile(Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    final SequenceFile.Reader reader;
    Set<FlowFile> flowFiles = new HashSet<>();
    reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final Text key = new Text();
    final KeyValueWriterCallback callback = new KeyValueWriterCallback(reader);
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Read from SequenceFile: {} ", new Object[] { file });
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may or may not be a file name
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }
            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            callback.key = key;
            try {
                flowFile = session.write(flowFile, callback);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
    return flowFiles;
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessException(org.apache.nifi.processor.exception.ProcessException) Reader(org.apache.hadoop.io.SequenceFile.Reader) SequenceFile(org.apache.hadoop.io.SequenceFile) Text(org.apache.hadoop.io.Text) HashSet(java.util.HashSet)
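
The loop above advances with the key-only overload reader.next(key), so only keys are deserialized in the loop itself; the record's value is presumably streamed later by the KeyValueWriterCallback when session.write is called. Below is a minimal, self-contained sketch of that key-only iteration pattern; the file path and the Text key type are assumptions for illustration, not taken from the NiFi processor.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class KeyOnlyScan {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical local SequenceFile with Text keys; point this at a real file.
        Path path = new Path("/tmp/example.seq");
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            while (reader.next(key)) {       // reads only the key of the next record
                System.out.println(key);     // the value could be read via getCurrentValue(...)
                key.clear();
            }
        }
    }
}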

Example 3 with Reader

use of org.apache.hadoop.io.SequenceFile.Reader in project nifi by apache.

the class ValueReader method readSequenceFile.

@Override
public Set<FlowFile> readSequenceFile(final Path file, Configuration configuration, FileSystem fileSystem) throws IOException {
    Set<FlowFile> flowFiles = new HashSet<>();
    final SequenceFile.Reader reader = new SequenceFile.Reader(configuration, Reader.file(fileSystem.makeQualified(file)));
    final String inputfileName = file.getName() + "." + System.nanoTime() + ".";
    int counter = 0;
    LOG.debug("Reading from sequence file {}", new Object[] { file });
    final OutputStreamWritableCallback writer = new OutputStreamWritableCallback(reader);
    Text key = new Text();
    try {
        while (reader.next(key)) {
            String fileName = key.toString();
            // the key may or may not be a file name
            if (LOOKS_LIKE_FILENAME.matcher(fileName).matches()) {
                if (fileName.contains(File.separator)) {
                    fileName = StringUtils.substringAfterLast(fileName, File.separator);
                }
                fileName = fileName + "." + System.nanoTime();
            } else {
                fileName = inputfileName + ++counter;
            }
            FlowFile flowFile = session.create();
            flowFile = session.putAttribute(flowFile, CoreAttributes.FILENAME.key(), fileName);
            try {
                flowFile = session.write(flowFile, writer);
                flowFiles.add(flowFile);
            } catch (ProcessException e) {
                LOG.error("Could not write to flowfile {}", new Object[] { flowFile }, e);
                session.remove(flowFile);
            }
            key.clear();
        }
    } finally {
        IOUtils.closeQuietly(reader);
    }
    return flowFiles;
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) ProcessException(org.apache.nifi.processor.exception.ProcessException) Reader(org.apache.hadoop.io.SequenceFile.Reader) SequenceFile(org.apache.hadoop.io.SequenceFile) SequenceFileReader(org.apache.nifi.processors.hadoop.util.SequenceFileReader) Text(org.apache.hadoop.io.Text) HashSet(java.util.HashSet)

Example 4 with Reader

use of org.apache.hadoop.io.SequenceFile.Reader in project spark-dataflow by cloudera.

the class HadoopFileFormatPipelineTest method testSequenceFile.

@Test
public void testSequenceFile() throws Exception {
    populateFile();
    Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
    @SuppressWarnings("unchecked") Class<? extends FileInputFormat<IntWritable, Text>> inputFormatClass = (Class<? extends FileInputFormat<IntWritable, Text>>) (Class<?>) SequenceFileInputFormat.class;
    HadoopIO.Read.Bound<IntWritable, Text> read = HadoopIO.Read.from(inputFile.getAbsolutePath(), inputFormatClass, IntWritable.class, Text.class);
    PCollection<KV<IntWritable, Text>> input = p.apply(read);
    @SuppressWarnings("unchecked") Class<? extends FileOutputFormat<IntWritable, Text>> outputFormatClass = (Class<? extends FileOutputFormat<IntWritable, Text>>) (Class<?>) TemplatedSequenceFileOutputFormat.class;
    @SuppressWarnings("unchecked") HadoopIO.Write.Bound<IntWritable, Text> write = HadoopIO.Write.to(outputFile.getAbsolutePath(), outputFormatClass, IntWritable.class, Text.class);
    input.apply(write.withoutSharding());
    EvaluationResult res = SparkPipelineRunner.create().run(p);
    res.close();
    IntWritable key = new IntWritable();
    Text value = new Text();
    try (Reader reader = new Reader(new Configuration(), Reader.file(new Path(outputFile.toURI())))) {
        int i = 0;
        while (reader.next(key, value)) {
            assertEquals(i, key.get());
            assertEquals("value-" + i, value.toString());
            i++;
        }
    }
}
Also used : FileOutputFormat(org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) SequenceFileInputFormat(org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat) Reader(org.apache.hadoop.io.SequenceFile.Reader) Text(org.apache.hadoop.io.Text) KV(com.google.cloud.dataflow.sdk.values.KV) FileInputFormat(org.apache.hadoop.mapreduce.lib.input.FileInputFormat) Pipeline(com.google.cloud.dataflow.sdk.Pipeline) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
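
The assertions read back keys 0..n-1 with values "value-" + i, so populateFile() presumably writes matching (IntWritable, Text) pairs. The helper below is a hypothetical sketch of that writer side; the method name, record count, and file handling are assumptions, not the test's actual code.

import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class PopulateSketch {

    // Hypothetical stand-in for the test's populateFile(): writes (i, "value-" + i) pairs.
    static void populate(File inputFile, int records) throws Exception {
        Configuration conf = new Configuration();
        try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path(inputFile.toURI())),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(Text.class))) {
            for (int i = 0; i < records; i++) {
                writer.append(new IntWritable(i), new Text("value-" + i));
            }
        }
    }
}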

Example 5 with Reader

use of org.apache.hadoop.io.SequenceFile.Reader in project nutch by apache.

the class LinkReader method read.

@Override
public List read(String path) throws FileNotFoundException {
    List<HashMap> rows = new ArrayList<>();
    Path file = new Path(path);
    SequenceFile.Reader reader;
    try {
        reader = new SequenceFile.Reader(conf, Reader.file(file));
        Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        LinkDatum value = new LinkDatum();
        while (reader.next(key, value)) {
            try {
                HashMap<String, String> t_row = getLinksRow(key, value);
                rows.add(t_row);
            } catch (Exception e) {
                // skip rows that cannot be converted by getLinksRow
            }
        }
        reader.close();
    } catch (FileNotFoundException fne) {
        throw new FileNotFoundException();
    } catch (IOException e) {
        LOG.error("Error occurred while reading file {} : {}", file, StringUtils.stringifyException(e));
        throw new WebApplicationException();
    }
    return rows;
}
Also used : Path(org.apache.hadoop.fs.Path) Reader(org.apache.hadoop.io.SequenceFile.Reader) WebApplicationException(javax.ws.rs.WebApplicationException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) Writable(org.apache.hadoop.io.Writable) LinkDatum(org.apache.nutch.scoring.webgraph.LinkDatum) IOException(java.io.IOException) SequenceFile(org.apache.hadoop.io.SequenceFile)
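
Note that this snippet closes the reader only on the success path; if next() throws mid-iteration, the reader is never closed. The sketch below shows the same read loop with try-with-resources so the reader is closed on every path; the file path and the Text value type are assumptions for illustration, not the Nutch code.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.ReflectionUtils;

public class SafeSequenceFileRead {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path("/tmp/linkdump/part-00000"); // hypothetical path
        List<String> rows = new ArrayList<>();
        // try-with-resources closes the reader even if next() throws mid-iteration.
        try (SequenceFile.Reader reader =
                new SequenceFile.Reader(conf, SequenceFile.Reader.file(file))) {
            Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
            Text value = new Text();                      // Text values assumed here
            while (reader.next(key, value)) {
                rows.add(key + "\t" + value);
            }
        }
        rows.forEach(System.out::println);
    }
}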

Aggregations

Reader (org.apache.hadoop.io.SequenceFile.Reader) 23
Path (org.apache.hadoop.fs.Path) 18
SequenceFile (org.apache.hadoop.io.SequenceFile) 16
IOException (java.io.IOException) 14
Writable (org.apache.hadoop.io.Writable) 13
FileNotFoundException (java.io.FileNotFoundException) 12
WebApplicationException (javax.ws.rs.WebApplicationException) 12
ArrayList (java.util.ArrayList) 9
HashMap (java.util.HashMap) 6
Node (org.apache.nutch.scoring.webgraph.Node) 4
List (java.util.List) 3
Text (org.apache.hadoop.io.Text) 3
LinkDatum (org.apache.nutch.scoring.webgraph.LinkDatum) 3
Test (org.junit.Test) 3
HashSet (java.util.HashSet) 2
Writer (org.apache.hadoop.io.SequenceFile.Writer) 2
FlowFile (org.apache.nifi.flowfile.FlowFile) 2
ProcessException (org.apache.nifi.processor.exception.ProcessException) 2
MatrixReader (org.apache.sysml.runtime.io.MatrixReader) 2
Pipeline (com.google.cloud.dataflow.sdk.Pipeline) 1