Usage of org.apache.pig.data.DataBag in the mongo-hadoop project (mongodb): class BSONLoader, method readField.
/**
 * Convert an object from a MongoDB document into a type that Pig can
 * understand, based on the expectations of the given schema.
 *
 * @param obj object from a MongoDB document
 * @param field the schema describing this field; may be {@code null},
 *              in which case the value is passed through unchanged
 * @return an object appropriate for Pig, or {@code null} when {@code obj}
 *         is null or the conversion fails (a warning is logged)
 * @throws IOException propagated from nested schema access
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
protected static Object readField(final Object obj, final ResourceFieldSchema field) throws IOException {
    if (obj == null) {
        return null;
    }
    try {
        // With no schema to guide the conversion, pass the value through as-is.
        if (field == null) {
            return obj;
        }
        switch (field.getType()) {
            case DataType.INTEGER:
                return Integer.parseInt(obj.toString());
            case DataType.LONG:
                return Long.parseLong(obj.toString());
            case DataType.FLOAT:
                return Float.parseFloat(obj.toString());
            case DataType.DOUBLE:
                return Double.parseDouble(obj.toString());
            case DataType.BYTEARRAY:
                return BSONLoader.convertBSONtoPigType(obj);
            case DataType.CHARARRAY:
                return obj.toString();
            case DataType.DATETIME:
                return new DateTime(obj);
            case DataType.TUPLE:
                // Recursively convert each named field of the embedded document.
                ResourceSchema s = field.getSchema();
                ResourceFieldSchema[] fs = s.getFields();
                Tuple t = tupleFactory.newTuple(fs.length);
                BasicDBObject val = (BasicDBObject) obj;
                for (int j = 0; j < fs.length; j++) {
                    t.set(j, readField(val.get(fs[j].getName()), fs[j]));
                }
                return t;
            case DataType.BAG:
                // A bag's schema nests as bag -> tuple -> fields; unwrap two
                // levels to reach the schema of the tuples inside the bag.
                s = field.getSchema();
                fs = s.getFields();
                s = fs[0].getSchema();
                fs = s.getFields();
                DataBag bag = bagFactory.newDefaultBag();
                BasicDBList vals = (BasicDBList) obj;
                for (Object val1 : vals) {
                    t = tupleFactory.newTuple(fs.length);
                    for (int k = 0; k < fs.length; k++) {
                        t.set(k, readField(((BasicDBObject) val1).get(fs[k].getName()), fs[k]));
                    }
                    bag.add(t);
                }
                return bag;
            case DataType.MAP:
                // A map schema, when present, has a single field describing the values.
                s = field.getSchema();
                fs = s != null ? s.getFields() : null;
                Map outputMap = new HashMap();
                if (obj instanceof BSONObject) {
                    BasicBSONObject inputMap = (BasicBSONObject) obj;
                    for (String key : inputMap.keySet()) {
                        if (fs != null) {
                            outputMap.put(key, readField(inputMap.get(key), fs[0]));
                        } else {
                            outputMap.put(key, readField(inputMap.get(key), null));
                        }
                    }
                } else if (obj instanceof DBRef) {
                    // Represent a DBRef as a two-entry map of its collection name and id.
                    DBRef ref = (DBRef) obj;
                    outputMap.put("$ref", ref.getCollectionName());
                    outputMap.put("$id", ref.getId().toString());
                }
                return outputMap;
            default:
                // FIX: replaced a leftover gibberish debug string with a meaningful message.
                LOG.info("No explicit conversion for field " + field.getName()
                         + "; falling back to default BSON-to-Pig conversion.");
                return BSONLoader.convertBSONtoPigType(obj);
        }
    } catch (Exception e) {
        // Conversion failures are tolerated by design: warn and drop the value.
        String fieldName = field.getName() == null ? "" : field.getName();
        String type = DataType.genTypeToNameMap().get(field.getType());
        // FIX: attach the exception as the log cause instead of discarding it.
        LOG.warn("Type " + type + " for field " + fieldName + " can not be applied to " + obj.getClass().toString(), e);
        return null;
    }
}
Usage of org.apache.pig.data.DataBag in the mongo-hadoop project (mongodb): class BSONStorage, method getTypeForBSON.
/**
 * Returns object more suited for BSON storage. Object o corresponds to a field value in pig.
 *
 * @param o object representing pig type to convert to BSON-like object
 * @param field field to place o in; may be {@code null} or of UNKNOWN type,
 *              in which case the type is inferred from {@code o} itself
 * @param toIgnore name of field in Object o to ignore (used to unwrap
 *                 single-field tuples produced by bag schemas)
 * @return an Object that can be stored as BSON, or {@code null} for null input
 * @throws IOException if no schema is available from the field
 */
public static Object getTypeForBSON(final Object o, final ResourceFieldSchema field, final String toIgnore) throws IOException {
    if (null == o) {
        return null;
    }
    byte dataType;
    ResourceSchema fieldInnerSchema = null;
    if (null == field || DataType.UNKNOWN == field.getType()) {
        // No usable schema: infer the Pig type from the value itself.
        dataType = DataType.findType(o);
    } else {
        dataType = field.getType();
        fieldInnerSchema = field.getSchema();
    }
    // Maps can arrive typed as BYTEARRAY; correct the type before dispatching.
    if (dataType == DataType.BYTEARRAY && o instanceof Map) {
        dataType = DataType.MAP;
    }
    switch (dataType) {
        case DataType.NULL:
            return null;
        case DataType.INTEGER:
        case DataType.LONG:
        case DataType.FLOAT:
        case DataType.DOUBLE:
            return o;
        case DataType.BYTEARRAY:
            // Boxed BSON values carry their original object; everything else
            // is stored via its string form.
            if (o instanceof PigBoxedBSONValue) {
                return ((PigBoxedBSONValue) o).getObject();
            }
            return o.toString();
        case DataType.CHARARRAY:
            return o;
        case DataType.DATETIME:
            return ((DateTime) o).toDate();
        // Given a TUPLE, create a Map so BSONEncoder will eat it
        case DataType.TUPLE:
            // Without an inner schema, BasicBSONEncoder consumes the tuple as an Iterable.
            if (fieldInnerSchema == null) {
                return o;
            }
            ResourceFieldSchema[] fs = fieldInnerSchema.getFields();
            // A single-field tuple whose name matches toIgnore is a bag wrapper: unwrap it.
            // FIX: null-safe comparison — fs[0].getName() may be null for anonymous fields.
            if (1 == fs.length && toIgnore != null && toIgnore.equals(fs[0].getName())) {
                return getTypeForBSON(((Tuple) o).get(0), fs[0], toIgnore);
            }
            // Otherwise treat the Tuple as a Map keyed by the schema's field names.
            Map<String, Object> m = new LinkedHashMap<String, Object>();
            for (int j = 0; j < fs.length; j++) {
                m.put(fs[j].getName(), getTypeForBSON(((Tuple) o).get(j), fs[j], toIgnore));
            }
            return m;
        // Given a BAG, create an Array so BSONEncoder will eat it.
        case DataType.BAG:
            // Without an inner schema, BasicBSONEncoder consumes the bag as an Iterable.
            if (null == fieldInnerSchema) {
                return o;
            }
            fs = fieldInnerSchema.getFields();
            ArrayList<Object> bagList = new ArrayList<Object>();
            for (Tuple t : (DataBag) o) {
                bagList.add(getTypeForBSON(t, fs[0], toIgnore));
            }
            return bagList;
        case DataType.MAP:
            // FIX: removed dead `if (o == null)` check — nulls are handled at method entry.
            // Wildcard cast avoids raw-type usage; keys are stringified for BSON.
            Map<?, ?> map = (Map<?, ?>) o;
            Map<String, Object> out = new HashMap<String, Object>(map.size());
            for (Object key : map.keySet()) {
                out.put(key.toString(), getTypeForBSON(map.get(key), null, toIgnore));
            }
            return out;
        default:
            return o;
    }
}
Usage of org.apache.pig.data.DataBag in the mongo-hadoop project (mongodb): class JSONPigReplaceTest, method testSimpleMultipleReplace.
/**
 * Verifies that JSONPigReplace substitutes multiple templates from a single
 * tuple containing both a bag field and a scalar chararray field.
 */
@Test
public void testSimpleMultipleReplace() throws Exception {
    // create tuple ({("Daniel", "Alabi")}, "Carleton College")
    // with schema 'b:{b:(f:chararray,l:chararray)}, s:chararray'
    Tuple t1 = tupleFactory.newTuple(2);
    t1.set(0, "Daniel");
    t1.set(1, "Alabi");
    DataBag b = bagFactory.newDefaultBag();
    b.add(t1);
    Tuple t = tupleFactory.newTuple(2);
    t.set(0, b);
    t.set(1, "Carleton College");
    JSONPigReplace j = new JSONPigReplace(new String[] { "{first:'$f', last:'$l', school:'$s'}", "{$push : {schools: '$s'}}" });
    BasicBSONObject[] bs = j.substitute(t, "b:{t:(f:chararray,l:chararray)}, s:chararray", null);
    assertNotNull(bs);
    // FIX: use assertEquals instead of assertTrue(==) for a useful failure message.
    assertEquals(2, bs.length);
    // should produce
    // { "first" : "Daniel" , "last" : "Alabi" , "school" : "Carleton College"}
    // and
    // { "$push" : { "schools" : "Carleton College"}}
    BasicBSONObject res1 = bs[0];
    BasicBSONObject res2 = bs[1];
    // FIX: assertEquals takes (expected, actual) — arguments were reversed.
    assertEquals("Daniel", res1.get("first"));
    assertEquals("Alabi", res1.get("last"));
    assertEquals("Carleton College", ((BasicBSONObject) res2.get("$push")).get("schools"));
}
Usage of org.apache.pig.data.DataBag in the mongo-hadoop project (mongodb): class JSONPigReplaceTest, method testNamedArrayReplace.
/**
 * Verifies that a named bag field ('$cars') is substituted into a template
 * as a BSON array of single-entry documents.
 */
@Test
public void testNamedArrayReplace() throws Exception {
    // create tuple ({("a"), ("b"), ("c")})
    // with schema 'cars:{f:(t:chararray)}'
    DataBag b = bagFactory.newDefaultBag();
    b.add(tupleFactory.newTuple("a"));
    b.add(tupleFactory.newTuple("b"));
    b.add(tupleFactory.newTuple("c"));
    JSONPigReplace j = new JSONPigReplace(new String[] { "{days : [1,2,3], age : 19, cars : '$cars'}" });
    BasicBSONObject[] bs = j.substitute(tupleFactory.newTuple(b), "cars : {f:(t:chararray)}", null);
    assertNotNull(bs);
    // FIX: use assertEquals instead of assertTrue(==) for a useful failure message.
    assertEquals(1, bs.length);
    // should produce BSONObject
    // { "days" : [ 1 , 2 , 3] , "age" : 19 , "cars" : [ { "t" : "a"} , { "t" : "b"} , { "t" : "c"}]}
    BasicBSONObject res = bs[0];
    ArrayList cars = (ArrayList) res.get("cars");
    // FIX: assertEquals takes (expected, actual) — arguments were reversed.
    assertEquals(3, cars.size());
    Object o = cars.get(0);
    assertEquals("a", ((Map) o).get("t"));
}
Usage of org.apache.pig.data.DataBag in the mongo-hadoop project (mongodb): class MongoLoaderTest, method testSimpleBag.
/**
 * Verifies that BSONLoader.readField converts a BasicDBList of documents
 * into a Pig DataBag of two-field tuples, following the given bag schema.
 */
@Test
public void testSimpleBag() throws IOException {
    final String userSchema = "b:{t:tuple(t1:chararray, t2:chararray)}";

    // Build the input: a BSON list holding two documents with fields t1/t2.
    BasicDBList input = new BasicDBList();
    input.add(new BasicDBObject().append("t1", "t11_value").append("t2", "t12_value"));
    input.add(new BasicDBObject().append("t1", "t21_value").append("t2", "t22_value"));

    MongoLoader loader = new MongoLoader(userSchema);
    DataBag resultBag = (DataBag) BSONLoader.readField(input, loader.getFields()[0]);

    // Expect exactly two tuples, in insertion order, each with two chararrays.
    Iterator<Tuple> iter = resultBag.iterator();

    Tuple first = iter.next();
    assertEquals(2, first.size());
    assertEquals("t11_value", first.get(0));
    assertEquals("t12_value", first.get(1));

    Tuple second = iter.next();
    assertEquals(2, second.size());
    assertEquals("t21_value", second.get(0));
    assertEquals("t22_value", second.get(1));

    assertFalse(iter.hasNext());
}
Aggregated usage examples end here.