Search in sources :

Example 1 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class MemPipeline method write.

/**
 * Writes the elements of {@code collection} as text lines to a file named
 * {@code "out"} underneath the path of {@code target}.
 *
 * <p>For a {@link PTable}, each element is cast to {@link Pair} and written as
 * {@code first<TAB>second}; for any other collection each element's
 * {@code toString()} is written. Every line is terminated with {@code "\r\n"}.
 *
 * <p>Targets that are not {@link PathTarget} instances, and any
 * {@link IOException} raised while writing, are logged rather than thrown.
 *
 * @param collection the collection whose materialized contents are written
 * @param target the destination; only {@link PathTarget} is supported
 */
@Override
public void write(PCollection<?> collection, Target target) {
    if (!(target instanceof PathTarget)) {
        LOG.error("Target " + target + " is not a PathTarget instance");
        return;
    }
    Path path = ((PathTarget) target).getPath();
    try {
        FileSystem fs = FileSystem.get(conf);
        FSDataOutputStream os = fs.create(new Path(path, "out"));
        // Close the stream on every exit path; previously a failure in the
        // middle of writing left the stream open.
        try {
            // NOTE(review): writeBytes keeps only the low byte of each char,
            // so non-ASCII text is not preserved - confirm this is acceptable.
            if (collection instanceof PTable) {
                for (Object o : collection.materialize()) {
                    Pair<?, ?> p = (Pair<?, ?>) o;
                    os.writeBytes(p.first().toString());
                    os.writeBytes("\t");
                    os.writeBytes(p.second().toString());
                    os.writeBytes("\r\n");
                }
            } else {
                for (Object o : collection.materialize()) {
                    os.writeBytes(o.toString() + "\r\n");
                }
            }
        } finally {
            os.close();
        }
    } catch (IOException e) {
        LOG.error("Exception writing target: " + target, e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PathTarget(org.apache.crunch.io.PathTarget) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) IOException(java.io.IOException) PTable(org.apache.crunch.PTable) Pair(org.apache.crunch.Pair)

Example 2 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class MapsideJoinTest method testMapsideJoin_RightSideIsEmpty.

/**
 * Checks that a map-side join yields no rows when the right-hand table has
 * been passed through {@code NegativeFilter}.
 */
@Test
public void testMapsideJoin_RightSideIsEmpty() throws IOException {
    MRPipeline mrPipeline = new MRPipeline(MapsideJoinTest.class);

    PTable<Integer, String> customers = readTable(mrPipeline, "customers.txt");
    PTable<Integer, String> orders = readTable(mrPipeline, "orders.txt");

    // NegativeFilter presumably rejects every record, leaving the right side
    // empty - confirm against its definition.
    PTable<Integer, String> emptiedOrders =
            orders.parallelDo(new NegativeFilter(), orders.getPTableType());

    PTable<Integer, Pair<String, String>> joinResult = MapsideJoin.join(customers, emptiedOrders);
    List<Pair<Integer, Pair<String, String>>> joinedRows =
            Lists.newArrayList(joinResult.materialize());

    assertTrue(joinedRows.isEmpty());
}
Also used : MRPipeline(org.apache.crunch.impl.mr.MRPipeline) Pair(org.apache.crunch.Pair) Test(org.junit.Test)

Example 3 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class AvrosTest method testPairs.

/**
 * Runs the pair type built from two string types through
 * {@code testInputOutputFn}, pairing the Java value {@code ("a", "b")} with a
 * two-field Avro record holding the same values as {@link Utf8}.
 */
@Test
public void testPairs() throws Exception {
    AvroType<Pair<String, String>> pairType = Avros.pairs(Avros.strings(), Avros.strings());

    // Avro-side representation: a record on the pair schema with both fields set.
    GenericData.Record avroRecord = new GenericData.Record(pairType.getSchema());
    avroRecord.put(0, new Utf8("a"));
    avroRecord.put(1, new Utf8("b"));

    // Java-side representation of the same value.
    Pair<String, String> javaPair = Pair.of("a", "b");

    testInputOutputFn(pairType, javaPair, avroRecord);
}
Also used : Utf8(org.apache.avro.util.Utf8) GenericData(org.apache.avro.generic.GenericData) Pair(org.apache.crunch.Pair) Test(org.junit.Test)

Example 4 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Sort method sortQuads.

/**
 * Sorts a {@link PCollection} of {@link Tuple4}s according to the given
 * column ordering.
 *
 * <p>Each tuple is wrapped as the key of a table with {@code Void} values,
 * the table is grouped by key using options built from {@code columnOrders}
 * and then ungrouped, and finally the keys are unwrapped again.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering passed to {@code buildGroupingOptions}
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple4<V1, V2, V3, V4>> quadType = collection.getPType();

    // Table type keyed on the quad (rebuilt from its four sub-types) with null values.
    @SuppressWarnings("unchecked")
    PTableType<Tuple4<V1, V2, V3, V4>, Void> keyedType = family.tableOf(
            family.quads(
                    quadType.getSubTypes().get(0),
                    quadType.getSubTypes().get(1),
                    quadType.getSubTypes().get(2),
                    quadType.getSubTypes().get(3)),
            family.nulls());

    // Step 1: turn every tuple into a (tuple, null) pair so it can act as a key.
    PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {

        @Override
        public void process(Tuple4<V1, V2, V3, V4> input, Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> emitter) {
            emitter.emit(Pair.of(input, (Void) null));
        }
    }, keyedType);

    // Step 2: group by key with the requested ordering, then flatten again.
    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, quadType, columnOrders);
    PTable<Tuple4<V1, V2, V3, V4>, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

    // Step 3: drop the null values and hand back the bare tuples.
    return ordered.parallelDo(new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {

        @Override
        public void process(Pair<Tuple4<V1, V2, V3, V4>, Void> input, Emitter<Tuple4<V1, V2, V3, V4>> emitter) {
            emitter.emit(input.first());
        }
    }, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Tuple4(org.apache.crunch.Tuple4) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 5 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class MapsideJoin method join.

/**
 * Performs a map-side inner join of two tables.
 *
 * <p>The right-side table is materialized by running the pipeline, and its
 * output path is registered with the distributed cache so that it can be
 * loaded fully in memory. Only use this method when the right-side table's
 * contents fit in the memory allocated to mappers.
 *
 * @param left
 *          The left-side table of the join
 * @param right
 *          The right-side table of the join, whose contents will be fully
 *          read into memory
 * @return A table keyed on the join key, containing pairs of joined values
 * @throws CrunchRuntimeException
 *           if {@code right} does not belong to an {@link MRPipeline}, or if
 *           its materialized form is not a {@link SourcePathTargetImpl}
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
    if (!(right.getPipeline() instanceof MRPipeline)) {
        throw new CrunchRuntimeException("Map-side join is only supported within a MapReduce context");
    }
    MRPipeline mrPipeline = (MRPipeline) right.getPipeline();

    // TODO Move necessary logic to MRPipeline so that we can theoretically
    // optimize this by running the setup of multiple map-side joins concurrently
    mrPipeline.materialize(right);
    mrPipeline.run();

    ReadableSourceTarget<Pair<K, V>> materializedRight = mrPipeline.getMaterializeSourceTarget(right);
    if (!(materializedRight instanceof SourcePathTargetImpl)) {
        throw new CrunchRuntimeException("Right-side contents can't be read from a path");
    }

    // The cast is safe: the instanceof check above has just verified it.
    @SuppressWarnings("unchecked")
    SourcePathTargetImpl<Pair<K, V>> pathBackedRight = (SourcePathTargetImpl<Pair<K, V>>) materializedRight;

    // Register the right side's output with the distributed cache.
    Path rightPath = pathBackedRight.getPath();
    DistributedCache.addCacheFile(rightPath.toUri(), mrPipeline.getConfiguration());

    MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(rightPath.toString(), right.getPType());
    PTypeFamily family = left.getTypeFamily();
    return left.parallelDo(
            "mapjoin",
            joinFn,
            family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType())));
}
Also used : Path(org.apache.hadoop.fs.Path) MRPipeline(org.apache.crunch.impl.mr.MRPipeline) PTypeFamily(org.apache.crunch.types.PTypeFamily) SourcePathTargetImpl(org.apache.crunch.io.impl.SourcePathTargetImpl) CrunchRuntimeException(org.apache.crunch.impl.mr.run.CrunchRuntimeException) Pair(org.apache.crunch.Pair)

Aggregations

Pair (org.apache.crunch.Pair): 22; PTypeFamily (org.apache.crunch.types.PTypeFamily): 15; GroupingOptions (org.apache.crunch.GroupingOptions): 6; Configuration (org.apache.hadoop.conf.Configuration): 5; MRPipeline (org.apache.crunch.impl.mr.MRPipeline): 4; Test (org.junit.Test): 4; Collection (java.util.Collection): 3; CombineFn (org.apache.crunch.CombineFn): 3; Emitter (org.apache.crunch.Emitter): 3; PCollection (org.apache.crunch.PCollection): 3; Utf8 (org.apache.avro.util.Utf8): 2; DoFn (org.apache.crunch.DoFn): 2; Pipeline (org.apache.crunch.Pipeline): 2; Tuple3 (org.apache.crunch.Tuple3): 2; Path (org.apache.hadoop.fs.Path): 2; DatasetRepository (com.cloudera.cdk.data.DatasetRepository): 1; StandardEvent (com.cloudera.cdk.data.event.StandardEvent): 1; FileSystemDatasetRepository (com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository): 1; Session (com.cloudera.cdk.examples.demo.event.Session): 1; IOException (java.io.IOException): 1