Search in sources:

Example 6 with DataBag

use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.

the class BSONLoader method readField.

/**
 * Convert an object from a MongoDB document into a type that Pig can
 * understand, based on the expectations of the given schema.
 *
 * <p>Numeric types are parsed from the object's string form, tuples and
 * bags are built recursively from the nested schema, and maps are built
 * from either a {@link BSONObject} (one entry per key) or a {@link DBRef}
 * (entries {@code $ref} and {@code $id}). If the value cannot be converted
 * to the requested type, a warning is logged and {@code null} is returned
 * instead of failing the whole record.
 *
 * @param obj object from a MongoDB document; may be {@code null}
 * @param field the schema describing this field, or {@code null} to
 *              return {@code obj} unchanged
 * @return an object appropriate for Pig, or {@code null} if {@code obj}
 *         is {@code null} or could not be converted
 * @throws IOException declared for callers; conversion failures inside
 *                     this method are logged and reported as {@code null}
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
protected static Object readField(final Object obj, final ResourceFieldSchema field) throws IOException {
    if (obj == null) {
        return null;
    }
    // Without a schema there is nothing to convert to. Handling this before
    // the try block also guarantees the catch handler never sees a null field.
    if (field == null) {
        return obj;
    }
    try {
        switch(field.getType()) {
            case DataType.INTEGER:
                return Integer.parseInt(obj.toString());
            case DataType.LONG:
                return Long.parseLong(obj.toString());
            case DataType.FLOAT:
                return Float.parseFloat(obj.toString());
            case DataType.DOUBLE:
                return Double.parseDouble(obj.toString());
            case DataType.BYTEARRAY:
                return BSONLoader.convertBSONtoPigType(obj);
            case DataType.CHARARRAY:
                return obj.toString();
            case DataType.DATETIME:
                return new DateTime(obj);
            case DataType.TUPLE: {
                ResourceFieldSchema[] tupleFields = field.getSchema().getFields();
                return readTupleFields((BasicDBObject) obj, tupleFields);
            }
            case DataType.BAG: {
                // A bag schema wraps a single tuple schema; the columns of
                // each element are the fields of that inner tuple.
                ResourceFieldSchema[] bagFields = field.getSchema().getFields();
                ResourceFieldSchema[] elementFields = bagFields[0].getSchema().getFields();
                DataBag bag = bagFactory.newDefaultBag();
                for (Object element : (BasicDBList) obj) {
                    bag.add(readTupleFields((BasicDBObject) element, elementFields));
                }
                return bag;
            }
            case DataType.MAP: {
                ResourceSchema mapSchema = field.getSchema();
                // When a value schema is declared, its first field describes
                // every value in the map; otherwise values pass through as-is.
                ResourceFieldSchema valueField = mapSchema != null ? mapSchema.getFields()[0] : null;
                Map outputMap = new HashMap();
                if (obj instanceof BSONObject) {
                    // Use the BSONObject interface so the cast agrees with the
                    // instanceof check; casting to BasicBSONObject rejected
                    // every other BSONObject implementation.
                    BSONObject inputMap = (BSONObject) obj;
                    for (String key : inputMap.keySet()) {
                        outputMap.put(key, readField(inputMap.get(key), valueField));
                    }
                } else if (obj instanceof DBRef) {
                    DBRef ref = (DBRef) obj;
                    outputMap.put("$ref", ref.getCollectionName());
                    outputMap.put("$id", ref.getId().toString());
                }
                // Any other input type yields an empty map.
                return outputMap;
            }
            default:
                LOG.info("Using default conversion for field " + field.getName());
                return BSONLoader.convertBSONtoPigType(obj);
        }
    } catch (Exception e) {
        // Best effort: a value that does not fit the declared type becomes
        // null rather than aborting the load.
        String fieldName = field.getName() == null ? "" : field.getName();
        String type = DataType.genTypeToNameMap().get(field.getType());
        LOG.warn("Type " + type + " for field " + fieldName + " can not be applied to " + obj.getClass().toString());
        return null;
    }
}

/**
 * Build a Pig tuple by reading each schema field, by name, from a document.
 *
 * @param doc the document supplying the values
 * @param fields the schema fields, in tuple position order
 * @return a tuple with one converted value per schema field
 * @throws IOException if a value cannot be stored in the tuple
 */
private static Tuple readTupleFields(final BasicDBObject doc, final ResourceFieldSchema[] fields) throws IOException {
    Tuple tuple = tupleFactory.newTuple(fields.length);
    for (int i = 0; i < fields.length; i++) {
        tuple.set(i, readField(doc.get(fields[i].getName()), fields[i]));
    }
    return tuple;
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) DataBag(org.apache.pig.data.DataBag) HashMap(java.util.HashMap) BasicBSONObject(org.bson.BasicBSONObject) BSONObject(org.bson.BSONObject) DBRef(com.mongodb.DBRef) DateTime(org.joda.time.DateTime) ExecException(org.apache.pig.backend.executionengine.ExecException) IOException(java.io.IOException) BasicDBObject(com.mongodb.BasicDBObject) BasicDBList(com.mongodb.BasicDBList) BasicBSONObject(org.bson.BasicBSONObject) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) BasicBSONObject(org.bson.BasicBSONObject) BasicDBObject(com.mongodb.BasicDBObject) BSONObject(org.bson.BSONObject) HashMap(java.util.HashMap) Map(java.util.Map) Tuple(org.apache.pig.data.Tuple)

Example 7 with DataBag

use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.

the class BSONStorage method getTypeForBSON.

/**
 * Returns an object more suited for BSON storage. Object o corresponds to a field value in Pig.
 *
 * <p>The type is taken from {@code field} when it is present and not
 * {@code UNKNOWN}; otherwise it is detected from {@code o} itself. A value
 * typed as bytearray that is actually a {@link Map} is treated as a map.
 * Tuples with an inner schema become insertion-ordered maps keyed by field
 * name, bags with an inner schema become lists, and map values are converted
 * recursively without a schema.
 *
 * @param o        object representing pig type to convert to BSON-like object; may be {@code null}
 * @param field    field to place o in; may be {@code null}
 * @param toIgnore name of field in Object o to ignore
 * @return an Object that can be stored as BSON, or {@code null} if {@code o} is {@code null}
 * @throws IOException if a nested tuple value cannot be read
 */
public static Object getTypeForBSON(final Object o, final ResourceFieldSchema field, final String toIgnore) throws IOException {
    if (null == o) {
        return null;
    }
    byte dataType;
    ResourceSchema fieldInnerSchema = null;
    if (null == field || DataType.UNKNOWN == field.getType()) {
        dataType = DataType.findType(o);
    } else {
        dataType = field.getType();
        fieldInnerSchema = field.getSchema();
    }
    if (dataType == DataType.BYTEARRAY && o instanceof Map) {
        dataType = DataType.MAP;
    }
    switch(dataType) {
        case DataType.NULL:
            return null;
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
        case DataType.CHARARRAY:
            // These values are returned unchanged.
            return o;
        case DataType.BYTEARRAY:
            if (o instanceof PigBoxedBSONValue) {
                return ((PigBoxedBSONValue) o).getObject();
            }
            return o.toString();
        case DataType.DATETIME:
            return ((DateTime) o).toDate();
        // Given a TUPLE, create a Map so BSONEncoder will eat it
        case DataType.TUPLE: {
            // BasicBSONEncoder will consume it as an Iterable.
            if (fieldInnerSchema == null) {
                return o;
            }
            // If there was an inner schema, create a Map from the Tuple.
            ResourceFieldSchema[] fs = fieldInnerSchema.getFields();
            Tuple tuple = (Tuple) o;
            // A single-field tuple whose field is the ignored name is
            // unwrapped to its only value.
            if (1 == fs.length && fs[0].getName().equals(toIgnore)) {
                return getTypeForBSON(tuple.get(0), fs[0], toIgnore);
            }
            // If there is more than one field in the tuple or no fields
            // to ignore, treat the Tuple as a Map.
            Map<String, Object> m = new LinkedHashMap<String, Object>();
            for (int j = 0; j < fs.length; j++) {
                m.put(fs[j].getName(), getTypeForBSON(tuple.get(j), fs[j], toIgnore));
            }
            return m;
        }
        // Given a BAG, create an Array so BSONEncoder will eat it.
        case DataType.BAG: {
            // BasicBSONEncoder will consume it as an Iterable.
            if (null == fieldInnerSchema) {
                return o;
            }
            // The first field of the bag schema describes every tuple in it.
            ResourceFieldSchema elementField = fieldInnerSchema.getFields()[0];
            ArrayList<Object> bagList = new ArrayList<Object>();
            for (Tuple t : (DataBag) o) {
                bagList.add(getTypeForBSON(t, elementField, toIgnore));
            }
            return bagList;
        }
        case DataType.MAP: {
            // o is known to be non-null here (checked on entry).
            Map<?, ?> map = (Map<?, ?>) o;
            Map<String, Object> out = new HashMap<String, Object>(map.size());
            for (Map.Entry<?, ?> entry : map.entrySet()) {
                // Values carry no schema, so their type is detected from the value.
                out.put(entry.getKey().toString(), getTypeForBSON(entry.getValue(), null, toIgnore));
            }
            return out;
        }
        default:
            return o;
    }
}
Also used : ResourceSchema(org.apache.pig.ResourceSchema) DataBag(org.apache.pig.data.DataBag) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) DateTime(org.joda.time.DateTime) LinkedHashMap(java.util.LinkedHashMap) PigBoxedBSONValue(com.mongodb.hadoop.pig.udf.types.PigBoxedBSONValue) ResourceFieldSchema(org.apache.pig.ResourceSchema.ResourceFieldSchema) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) Tuple(org.apache.pig.data.Tuple)

Example 8 with DataBag

use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.

the class JSONPigReplaceTest method testSimpleMultipleReplace.

/**
 * Substitutes values from a tuple holding a one-element bag and a string
 * into two JSON templates, then checks the fields of both results.
 */
@Test
public void testSimpleMultipleReplace() throws Exception {
    // create tuple ({("Daniel", "Alabi")}, "Carleton College")
    // with schema 'b:{t:(f:chararray,l:chararray)}, s:chararray'
    Tuple t1 = tupleFactory.newTuple(2);
    t1.set(0, "Daniel");
    t1.set(1, "Alabi");
    DataBag b = bagFactory.newDefaultBag();
    b.add(t1);
    Tuple t = tupleFactory.newTuple(2);
    t.set(0, b);
    t.set(1, "Carleton College");
    JSONPigReplace j = new JSONPigReplace(new String[] { "{first:'$f', last:'$l', school:'$s'}", "{$push : {schools: '$s'}}" });
    BasicBSONObject[] bs = j.substitute(t, "b:{t:(f:chararray,l:chararray)}, s:chararray", null);
    assertNotNull(bs);
    // assertEquals(expected, actual) reports both values on failure.
    assertEquals(2, bs.length);
    // should produce
    // { "first" : "Daniel" , "last" : "Alabi" , "school" : "Carleton College"}
    // and
    // { "$push" : { "schools" : "Carleton College"}}
    BasicBSONObject res1 = bs[0];
    BasicBSONObject res2 = bs[1];
    assertEquals("Daniel", res1.get("first"));
    assertEquals("Alabi", res1.get("last"));
    assertEquals("Carleton College", ((BasicBSONObject) res2.get("$push")).get("schools"));
}
Also used : BasicBSONObject(org.bson.BasicBSONObject) DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Example 9 with DataBag

use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.

the class JSONPigReplaceTest method testNamedArrayReplace.

/**
 * Substitutes a bag of three single-field tuples for a named placeholder
 * and checks that the result holds a three-element list of maps.
 */
@Test
public void testNamedArrayReplace() throws Exception {
    // create tuple ({("a"), ("b"), ("c")})
    // with schema 'cars:{f:(t:chararray)}'
    DataBag b = bagFactory.newDefaultBag();
    b.add(tupleFactory.newTuple("a"));
    b.add(tupleFactory.newTuple("b"));
    b.add(tupleFactory.newTuple("c"));
    JSONPigReplace j = new JSONPigReplace(new String[] { "{days : [1,2,3], age : 19, cars : '$cars'}" });
    BasicBSONObject[] bs = j.substitute(tupleFactory.newTuple(b), "cars : {f:(t:chararray)}", null);
    assertNotNull(bs);
    // assertEquals(expected, actual) reports both values on failure.
    assertEquals(1, bs.length);
    // should produce BSONObject
    // { "days" : [ 1 , 2 , 3] , "age" : 19 , "cars" : [ { "t" : "a"} , { "t" : "b"} , { "t" : "c"}]}
    BasicBSONObject res = bs[0];
    ArrayList<?> cars = (ArrayList<?>) res.get("cars");
    assertEquals(3, cars.size());
    Map<?, ?> firstCar = (Map<?, ?>) cars.get(0);
    assertEquals("a", firstCar.get("t"));
}
Also used : BasicBSONObject(org.bson.BasicBSONObject) DataBag(org.apache.pig.data.DataBag) ArrayList(java.util.ArrayList) BasicBSONObject(org.bson.BasicBSONObject) Test(org.junit.Test)

Example 10 with DataBag

use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.

the class MongoLoaderTest method testSimpleBag.

/**
 * Reads a list of two documents through a bag-of-tuples schema and checks
 * that each document becomes a two-field tuple, in order, with nothing left
 * over in the bag.
 */
@Test
public void testSimpleBag() throws IOException {
    final String[][] expectedRows = {
        { "t11_value", "t12_value" },
        { "t21_value", "t22_value" }
    };

    BasicDBList documents = new BasicDBList();
    documents.add(new BasicDBObject().append("t1", "t11_value").append("t2", "t12_value"));
    documents.add(new BasicDBObject().append("t1", "t21_value").append("t2", "t22_value"));

    MongoLoader loader = new MongoLoader("b:{t:tuple(t1:chararray, t2:chararray)}");
    DataBag loadedBag = (DataBag) BSONLoader.readField(documents, loader.getFields()[0]);

    Iterator<Tuple> rows = loadedBag.iterator();
    for (String[] expectedRow : expectedRows) {
        Tuple row = rows.next();
        assertEquals(2, row.size());
        assertEquals(expectedRow[0], row.get(0));
        assertEquals(expectedRow[1], row.get(1));
    }
    // The bag must contain exactly the rows checked above.
    assertFalse(rows.hasNext());
}
Also used : BasicDBList(com.mongodb.BasicDBList) BasicDBObject(com.mongodb.BasicDBObject) DataBag(org.apache.pig.data.DataBag) BasicDBObject(com.mongodb.BasicDBObject) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag)32 Tuple (org.apache.pig.data.Tuple)27 Test (org.junit.Test)10 Map (java.util.Map)7 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 BasicBSONObject (org.bson.BasicBSONObject)6 ArrayList (java.util.ArrayList)5 BasicDBList (com.mongodb.BasicDBList)3 BasicDBObject (com.mongodb.BasicDBObject)3 List (java.util.List)3 Properties (java.util.Properties)3 DefaultDataBag (org.apache.pig.data.DefaultDataBag)3 UDFContext (org.apache.pig.impl.util.UDFContext)3 DateTime (org.joda.time.DateTime)3 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)2 ResourceSchema (org.apache.pig.ResourceSchema)2 ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema)2 DefaultTuple (org.apache.pig.data.DefaultTuple)2 ParallelTopicModel (cc.mallet.topics.ParallelTopicModel)1