
Example 91 with LongWritable

use of org.apache.hadoop.io.LongWritable in project incubator-gobblin by apache.

the class GobblinWorkUnitsInputFormatTest method testRecordReader.

@Test
public void testRecordReader() throws Exception {
    List<String> paths = Lists.newArrayList("/path1", "/path2");
    GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);
    GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
    RecordReader<LongWritable, Text> recordReader = inputFormat.createRecordReader(split, new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));
    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
    Assert.assertFalse(recordReader.nextKeyValue());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) Test(org.testng.annotations.Test)
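
The reader above yields one (position, path) pair per work-unit path, keyed by a zero-based LongWritable. A minimal sketch of the same drain-the-reader idiom for any RecordReader<LongWritable, Text> follows; the class and method names are illustrative, not Gobblin API.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;

public final class RecordReaderDrain {

    // Reads every remaining (LongWritable, Text) pair and returns the values in order.
    // Assumes the reader has already been created and initialized, as in the test above.
    static List<String> drain(RecordReader<LongWritable, Text> reader) throws Exception {
        List<String> values = new ArrayList<>();
        while (reader.nextKeyValue()) {
            values.add(reader.getCurrentValue().toString());
        }
        return values;
    }
}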

Example 92 with LongWritable

use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.

the class CountPlan method reduce.

@Override
public void reduce(final IntermediateProspect prospect, final Iterable<LongWritable> counts, final Date timestamp, final Reducer.Context context) throws IOException, InterruptedException {
    long sum = 0;
    for (final LongWritable count : counts) {
        sum += count.get();
    }
    final String indexType = prospect.getTripleValueType().getIndexType();
    // not sure if this is the best idea..
    if ((sum >= 0) || indexType.equals(TripleValueType.PREDICATE.getIndexType())) {
        final Mutation m = new Mutation(indexType + DELIM + prospect.getData() + DELIM + ProspectorUtils.getReverseIndexDateTime(timestamp));
        final String dataType = prospect.getDataType();
        final ColumnVisibility visibility = new ColumnVisibility(prospect.getVisibility());
        final Value sumValue = new Value(("" + sum).getBytes(StandardCharsets.UTF_8));
        m.put(COUNT, prospect.getDataType(), visibility, timestamp.getTime(), sumValue);
        context.write(null, m);
    }
}
Also used : Value(org.apache.accumulo.core.data.Value) LongWritable(org.apache.hadoop.io.LongWritable) Mutation(org.apache.accumulo.core.data.Mutation) ColumnVisibility(org.apache.accumulo.core.security.ColumnVisibility)
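
The loop above is the standard LongWritable summation idiom. A minimal, self-contained sketch of the same pattern as a plain Hadoop Reducer; the SumReducer class is illustrative and not part of Rya, which writes Accumulo Mutations instead.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Sums every LongWritable count seen for a key and emits one total per key.
public class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {

    private final LongWritable total = new LongWritable();

    @Override
    protected void reduce(Text key, Iterable<LongWritable> counts, Context context)
            throws IOException, InterruptedException {
        long sum = 0;
        for (LongWritable count : counts) {
            sum += count.get();
        }
        total.set(sum);
        context.write(key, total);
    }
}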

Example 93 with LongWritable

use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.

the class CountPlan method map.

@Override
public Collection<Map.Entry<IntermediateProspect, LongWritable>> map(final RyaStatement ryaStatement) {
    final RyaURI subject = ryaStatement.getSubject();
    final RyaURI predicate = ryaStatement.getPredicate();
    final String subjpred = ryaStatement.getSubject().getData() + DELIM + ryaStatement.getPredicate().getData();
    final String predobj = ryaStatement.getPredicate().getData() + DELIM + ryaStatement.getObject().getData();
    final String subjobj = ryaStatement.getSubject().getData() + DELIM + ryaStatement.getObject().getData();
    final RyaType object = ryaStatement.getObject();
    final int localIndex = URIUtil.getLocalNameIndex(subject.getData());
    final String namespace = subject.getData().substring(0, localIndex - 1);
    final String visibility = new String(ryaStatement.getColumnVisibility(), StandardCharsets.UTF_8);
    final List<Map.Entry<IntermediateProspect, LongWritable>> entries = new ArrayList<>(7);
    // Create an entry for each TripleValueType type.
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(subject.getData()).setDataType(URITYPE).setTripleValueType(TripleValueType.SUBJECT).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(predicate.getData()).setDataType(URITYPE).setTripleValueType(TripleValueType.PREDICATE).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(object.getData()).setDataType(object.getDataType().stringValue()).setTripleValueType(TripleValueType.OBJECT).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(subjpred).setDataType(XMLSchema.STRING.toString()).setTripleValueType(TripleValueType.SUBJECT_PREDICATE).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(subjobj).setDataType(XMLSchema.STRING.toString()).setTripleValueType(TripleValueType.SUBJECT_OBJECT).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(predobj).setDataType(XMLSchema.STRING.toString()).setTripleValueType(TripleValueType.PREDICATE_OBJECT).setVisibility(visibility).build(), ONE));
    entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder().setIndex(COUNT).setData(namespace).setDataType(URITYPE).setTripleValueType(TripleValueType.ENTITY).setVisibility(visibility).build(), ONE));
    return entries;
}
Also used : RyaURI(org.apache.rya.api.domain.RyaURI) IndexEntry(org.apache.rya.prospector.domain.IndexEntry) CustomEntry(org.apache.rya.prospector.utils.CustomEntry) Entry(java.util.Map.Entry) ArrayList(java.util.ArrayList) IntermediateProspect(org.apache.rya.prospector.domain.IntermediateProspect) LongWritable(org.apache.hadoop.io.LongWritable) RyaType(org.apache.rya.api.domain.RyaType)
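
Every entry above shares a single LongWritable ONE, so the map emits a count of 1 for each facet of the statement. A minimal sketch of the same emit-a-constant-count pattern with plain String keys; CountEmitter and its key layout are illustrative only.

import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;

public final class CountEmitter {

    // A single shared count of 1, mirroring the ONE constant used by CountPlan.
    private static final LongWritable ONE = new LongWritable(1);

    // Emits one (key, 1) entry per facet of a triple: subject, predicate, and object.
    static List<Map.Entry<String, LongWritable>> map(String subject, String predicate, String object) {
        List<Map.Entry<String, LongWritable>> entries = new ArrayList<>(3);
        entries.add(new SimpleImmutableEntry<>(subject, ONE));
        entries.add(new SimpleImmutableEntry<>(predicate, ONE));
        entries.add(new SimpleImmutableEntry<>(object, ONE));
        return entries;
    }
}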

Example 94 with LongWritable

use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.

the class DuplicateEliminationTest method testRdfMapperOutput.

@Test
public void testRdfMapperOutput() throws Exception {
    RyaStatement rya = TestUtils.ryaStatement("x", "subOrganizationOf", "y");
    RyaStatementWritable rsw = new RyaStatementWritable();
    rsw.setRyaStatement(rya);
    LongWritable l = new LongWritable();
    new MapDriver<LongWritable, RyaStatementWritable, Fact, Derivation>().withMapper(new DuplicateElimination.DuplicateRdfMapper()).withInput(l, rsw).withOutput(X_SUB_Y, X_SUB_Y.getDerivation()).runTest();
}
Also used : MapDriver(org.apache.hadoop.mrunit.mapreduce.MapDriver) RyaStatementWritable(org.apache.rya.accumulo.mr.RyaStatementWritable) RyaStatement(org.apache.rya.api.domain.RyaStatement) LongWritable(org.apache.hadoop.io.LongWritable) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)
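
MRUnit's MapDriver pushes a single (LongWritable, value) record through a mapper and verifies the emitted output. A minimal, self-contained sketch of that wiring, using an illustrative word-count mapper rather than Rya's DuplicateRdfMapper:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class TokenCountMapperTest {

    // Illustrative mapper: emits (token, 1) for each whitespace-separated token in a line.
    public static class TokenCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private static final LongWritable ONE = new LongWritable(1);

        @Override
        protected void map(LongWritable offset, Text line, Context context)
                throws IOException, InterruptedException {
            for (String token : line.toString().split("\\s+")) {
                context.write(new Text(token), ONE);
            }
        }
    }

    @Test
    public void emitsOneCountPerToken() throws Exception {
        new MapDriver<LongWritable, Text, Text, LongWritable>()
                .withMapper(new TokenCountMapper())
                .withInput(new LongWritable(0), new Text("a b"))
                .withOutput(new Text("a"), new LongWritable(1))
                .withOutput(new Text("b"), new LongWritable(1))
                .runTest();
    }
}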

Example 95 with LongWritable

use of org.apache.hadoop.io.LongWritable in project cdap by caskdata.

the class SparkLogParser method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");
    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
    final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {

        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
            return SparkAppUsingGetDataset.parse(input._2());
        }
    }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {

        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
            return stats1.aggregate(stats2);
        }
    }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {

        @Override
        public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
            final Gson gson = new Gson();
            return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {

                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
            }));
        }
    });
    // Collect all data to driver and write to dataset directly. That's the intent of the test.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}
Also used : Gson(com.google.gson.Gson) TxRunnable(co.cask.cdap.api.TxRunnable) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) DatasetContext(co.cask.cdap.api.data.DatasetContext) LogKey(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogKey) Text(org.apache.hadoop.io.Text) Function2(org.apache.spark.api.java.function.Function2) LogStats(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogStats) Tuple2(scala.Tuple2) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable)
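
Outside CDAP's sec.fromDataset helper, a JavaPairRDD<LongWritable, Text> is usually produced directly by TextInputFormat, which keys each line by its byte offset. A minimal sketch under that assumption; the path, app name, and lambda-style map assume a local Spark 2.x setup rather than the CDAP runtime.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public final class LongWritableTextRddSketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("log-parser-sketch").setMaster("local[*]"));
        // TextInputFormat yields (byte offset, line) pairs as (LongWritable, Text).
        JavaPairRDD<LongWritable, Text> lines = jsc.newAPIHadoopFile(
                "/tmp/input.log", TextInputFormat.class, LongWritable.class, Text.class, new Configuration());
        // Hadoop reuses Writable instances, so convert to plain Strings before collecting or caching.
        JavaRDD<String> asStrings = lines.map(pair -> pair._2().toString());
        System.out.println("lines read: " + asStrings.count());
        jsc.stop();
    }
}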

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable): 445
Text (org.apache.hadoop.io.Text): 220
Test (org.junit.Test): 171
IntWritable (org.apache.hadoop.io.IntWritable): 102
Path (org.apache.hadoop.fs.Path): 99
BytesWritable (org.apache.hadoop.io.BytesWritable): 70
FloatWritable (org.apache.hadoop.io.FloatWritable): 68
Configuration (org.apache.hadoop.conf.Configuration): 62
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 62
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 60
ArrayList (java.util.ArrayList): 59
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 57
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 53
IOException (java.io.IOException): 49
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 48
SequenceFile (org.apache.hadoop.io.SequenceFile): 42
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 40
FileSystem (org.apache.hadoop.fs.FileSystem): 37
JobConf (org.apache.hadoop.mapred.JobConf): 37
DeferredObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredObject): 35