Use of org.apache.hadoop.io.LongWritable in project incubator-gobblin by apache.
Class GobblinWorkUnitsInputFormatTest, method testRecordReader.
@Test
public void testRecordReader() throws Exception {
  List<String> paths = Lists.newArrayList("/path1", "/path2");
  GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);
  GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
  RecordReader<LongWritable, Text> recordReader = inputFormat.createRecordReader(split,
      new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));
  // The reader emits one (position, path) pair per work-unit path in the split, keyed by a LongWritable counter.
  recordReader.nextKeyValue();
  Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
  Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
  recordReader.nextKeyValue();
  Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
  Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
  Assert.assertFalse(recordReader.nextKeyValue());
}
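For context, the pattern the test relies on is the standard Hadoop RecordReader contract: nextKeyValue() advances the reader, and getCurrentKey()/getCurrentValue() expose Writable objects that the framework may reuse between calls. Below is a minimal, self-contained sketch (the drain helper is illustrative and not part of Gobblin) showing how any RecordReader<LongWritable, Text> can be consumed while copying the primitive and String values out of the possibly reused Writables:

import java.io.IOException;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;

// Illustrative helper: drain a RecordReader<LongWritable, Text> into (long, String) pairs.
static List<Map.Entry<Long, String>> drain(RecordReader<LongWritable, Text> reader)
    throws IOException, InterruptedException {
  List<Map.Entry<Long, String>> out = new ArrayList<>();
  while (reader.nextKeyValue()) {
    out.add(new AbstractMap.SimpleImmutableEntry<>(
        reader.getCurrentKey().get(),          // copy the long out of the (possibly reused) LongWritable
        reader.getCurrentValue().toString())); // copy the characters out of the (possibly reused) Text
  }
  return out;
}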
Use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.
Class CountPlan, method reduce.
@Override
public void reduce(final IntermediateProspect prospect, final Iterable<LongWritable> counts,
    final Date timestamp, final Reducer.Context context) throws IOException, InterruptedException {
  long sum = 0;
  for (final LongWritable count : counts) {
    sum += count.get();
  }
  final String indexType = prospect.getTripleValueType().getIndexType();
  // not sure if this is the best idea..
  if ((sum >= 0) || indexType.equals(TripleValueType.PREDICATE.getIndexType())) {
    final Mutation m = new Mutation(indexType + DELIM + prospect.getData() + DELIM + ProspectorUtils.getReverseIndexDateTime(timestamp));
    final String dataType = prospect.getDataType();
    final ColumnVisibility visibility = new ColumnVisibility(prospect.getVisibility());
    final Value sumValue = new Value(("" + sum).getBytes(StandardCharsets.UTF_8));
    m.put(COUNT, dataType, visibility, timestamp.getTime(), sumValue);
    context.write(null, m);
  }
}
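The reduce step above is the standard Hadoop counting idiom: iterate the Iterable<LongWritable>, unwrap each value with get(), and accumulate into a primitive long before writing a single result. As a minimal sketch, the same idiom expressed as a plain MapReduce reducer looks like the following (hypothetical class, not part of Rya; Hadoop ships an equivalent as org.apache.hadoop.mapreduce.lib.reduce.LongSumReducer):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative reducer showing the LongWritable summation idiom used by CountPlan.reduce.
public class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
  private final LongWritable result = new LongWritable();

  @Override
  protected void reduce(Text key, Iterable<LongWritable> counts, Context context)
      throws IOException, InterruptedException {
    long sum = 0;
    for (LongWritable count : counts) {
      sum += count.get();  // unwrap; the framework may reuse the same LongWritable instance across iterations
    }
    result.set(sum);
    context.write(key, result);  // reuse one output Writable instead of allocating per key
  }
}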
Use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.
Class CountPlan, method map.
@Override
public Collection<Map.Entry<IntermediateProspect, LongWritable>> map(final RyaStatement ryaStatement) {
  final RyaURI subject = ryaStatement.getSubject();
  final RyaURI predicate = ryaStatement.getPredicate();
  final String subjpred = ryaStatement.getSubject().getData() + DELIM + ryaStatement.getPredicate().getData();
  final String predobj = ryaStatement.getPredicate().getData() + DELIM + ryaStatement.getObject().getData();
  final String subjobj = ryaStatement.getSubject().getData() + DELIM + ryaStatement.getObject().getData();
  final RyaType object = ryaStatement.getObject();
  final int localIndex = URIUtil.getLocalNameIndex(subject.getData());
  final String namespace = subject.getData().substring(0, localIndex - 1);
  final String visibility = new String(ryaStatement.getColumnVisibility(), StandardCharsets.UTF_8);
  final List<Map.Entry<IntermediateProspect, LongWritable>> entries = new ArrayList<>(7);
  // Create an entry for each TripleValueType.
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(subject.getData()).setDataType(URITYPE)
      .setTripleValueType(TripleValueType.SUBJECT).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(predicate.getData()).setDataType(URITYPE)
      .setTripleValueType(TripleValueType.PREDICATE).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(object.getData()).setDataType(object.getDataType().stringValue())
      .setTripleValueType(TripleValueType.OBJECT).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(subjpred).setDataType(XMLSchema.STRING.toString())
      .setTripleValueType(TripleValueType.SUBJECT_PREDICATE).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(subjobj).setDataType(XMLSchema.STRING.toString())
      .setTripleValueType(TripleValueType.SUBJECT_OBJECT).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(predobj).setDataType(XMLSchema.STRING.toString())
      .setTripleValueType(TripleValueType.PREDICATE_OBJECT).setVisibility(visibility).build(), ONE));
  entries.add(new CustomEntry<IntermediateProspect, LongWritable>(IntermediateProspect.builder()
      .setIndex(COUNT).setData(namespace).setDataType(URITYPE)
      .setTripleValueType(TripleValueType.ENTITY).setVisibility(visibility).build(), ONE));
  return entries;
}
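Every entry above pairs an IntermediateProspect key with the shared ONE constant, presumably a single LongWritable holding 1, which is safe to share because the entries are only read downstream. A minimal sketch of the same shape using plain String keys and the JDK's AbstractMap.SimpleImmutableEntry in place of IntermediateProspect and CustomEntry (all names in this sketch are illustrative, not part of Rya):

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.io.LongWritable;

// Illustrative only: emit a (field, LongWritable(1)) pair for each field of a record.
static List<Map.Entry<String, LongWritable>> emitOnes(List<String> fields) {
  final LongWritable one = new LongWritable(1);  // shared instance; the entries are only read, never mutated
  List<Map.Entry<String, LongWritable>> entries = new ArrayList<>(fields.size());
  for (String field : fields) {
    entries.add(new AbstractMap.SimpleImmutableEntry<>(field, one));
  }
  return entries;
}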
Use of org.apache.hadoop.io.LongWritable in project incubator-rya by apache.
Class DuplicateEliminationTest, method testRdfMapperOutput.
@Test
public void testRdfMapperOutput() throws Exception {
  RyaStatement rya = TestUtils.ryaStatement("x", "subOrganizationOf", "y");
  RyaStatementWritable rsw = new RyaStatementWritable();
  rsw.setRyaStatement(rya);
  LongWritable l = new LongWritable();
  new MapDriver<LongWritable, RyaStatementWritable, Fact, Derivation>()
      .withMapper(new DuplicateElimination.DuplicateRdfMapper())
      .withInput(l, rsw)
      .withOutput(X_SUB_Y, X_SUB_Y.getDerivation())
      .runTest();
}
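The test uses MRUnit's MapDriver: feed one (LongWritable, RyaStatementWritable) input pair to the mapper and assert the expected (Fact, Derivation) output pair. In its simplest form, the same driver pattern looks roughly like the sketch below, assuming the new-API MRUnit classes (org.apache.hadoop.mrunit.mapreduce) and the identity behavior of the base Mapper class; the test itself is illustrative and not part of Rya:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

// Illustrative MRUnit pattern: the base Mapper is an identity mapper, so the
// (LongWritable, Text) input pair is expected unchanged on the output side.
@Test
public void testIdentityMapper() throws Exception {
  new MapDriver<LongWritable, Text, LongWritable, Text>()
      .withMapper(new Mapper<LongWritable, Text, LongWritable, Text>())
      .withInput(new LongWritable(0), new Text("line"))
      .withOutput(new LongWritable(0), new Text("line"))
      .runTest();
}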
Use of org.apache.hadoop.io.LongWritable in project cdap by caskdata.
Class SparkLogParser, method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  Map<String, String> runtimeArguments = sec.getRuntimeArguments();
  String inputFileSet = runtimeArguments.get("input");
  final String outputTable = runtimeArguments.get("output");
  JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
  final JavaPairRDD<String, String> aggregated = input
      .mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {
        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
          return SparkAppUsingGetDataset.parse(input._2());
        }
      })
      .reduceByKey(new Function2<LogStats, LogStats, LogStats>() {
        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
          return stats1.aggregate(stats2);
        }
      })
      .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {
        @Override
        public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
          final Gson gson = new Gson();
          return Lists.newArrayList(Iterators.transform(itor,
              new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {
                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                  return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
              }));
        }
      });
  // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
  sec.execute(new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      KeyValueTable kvTable = context.getDataset(outputTable);
      for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
        kvTable.write(entry.getKey(), entry.getValue());
      }
    }
  });
}
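The (LongWritable, Text) element type of the input RDD matches what Hadoop's TextInputFormat produces — the byte offset of each line as the key and the line itself as the value — which is presumably why the parser above only ever looks at input._2(). For comparison, a rough sketch of reading the same pair type in plain Spark outside CDAP (path and app name are hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

// Illustrative only (plain Spark, not CDAP): TextInputFormat keys each line by its byte offset.
JavaSparkContext jsc = new JavaSparkContext("local", "log-parser-sketch");
JavaPairRDD<LongWritable, Text> lines = jsc.newAPIHadoopFile(
    "hdfs:///logs/access.log",  // hypothetical input path
    TextInputFormat.class, LongWritable.class, Text.class, new Configuration());
JavaPairRDD<String, String> parsed = lines.mapToPair(
    pair -> new Tuple2<>(Long.toString(pair._1().get()), pair._2().toString()));

Converting the Writables to String right away, as both the CDAP program and this sketch do, also sidesteps the fact that Hadoop Writables are not Java-serializable and so cannot safely cross a Spark shuffle boundary unmodified.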