use of org.apache.pig.data.DataByteArray in project hive by apache.
the class AbstractHCatStorerTest method testStoreFuncAllSimpleTypes.
@Test
public void testStoreFuncAllSimpleTypes() throws IOException, CommandNeedRetryException {
  driver.run("drop table junit_unparted");
  String createTable = "create table junit_unparted(a int, b float, c double, d bigint, e string, h boolean, f binary, g binary) stored as " + storageFormat;
  int retCode = driver.run(createTable).getResponseCode();
  if (retCode != 0) {
    throw new IOException("Failed to create table.");
  }
  int i = 0;
  String[] input = new String[3];
  // Empty values except first column
  input[i++] = "0\t\t\t\t\t\t\t";
  // First column empty
  input[i++] = "\t" + i * 2.1f + "\t" + i * 1.1d + "\t" + i * 2L + "\t" + "lets hcat" + "\t" + "true" + "\tbinary-data";
  input[i++] = i + "\t" + i * 2.1f + "\t" + i * 1.1d + "\t" + i * 2L + "\t" + "lets hcat" + "\t" + "false" + "\tbinary-data";
  HcatTestUtils.createTestDataFile(INPUT_FILE_NAME, input);
  PigServer server = new PigServer(ExecType.LOCAL);
  server.setBatchOn();
  server.registerQuery("A = load '" + INPUT_FILE_NAME + "' as (a:int, b:float, c:double, d:long, e:chararray, h:boolean, f:bytearray);");
  // null gets stored into column g which is a binary field.
  server.registerQuery("store A into 'default.junit_unparted' using " + HCatStorer.class.getName() + "('','a:int, b:float, c:double, d:long, e:chararray, h:boolean, f:bytearray');");
  server.executeBatch();
  driver.run("select * from junit_unparted");
  ArrayList<String> res = new ArrayList<String>();
  driver.getResults(res);
  Iterator<String> itr = res.iterator();
  String next = itr.next();
  assertEquals("0\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL", next);
  assertEquals("NULL\t4.2\t2.2\t4\tlets hcat\ttrue\tbinary-data\tNULL", itr.next());
  assertEquals("3\t6.2999997\t3.3000000000000003\t6\tlets hcat\tfalse\tbinary-data\tNULL", itr.next());
  assertFalse(itr.hasNext());
  server.registerQuery("B = load 'junit_unparted' using " + HCatLoader.class.getName() + ";");
  Iterator<Tuple> iter = server.openIterator("B");
  int count = 0;
  int num5nulls = 0;
  while (iter.hasNext()) {
    Tuple t = iter.next();
    if (t.get(6) == null) {
      num5nulls++;
    } else {
      assertTrue(t.get(6) instanceof DataByteArray);
    }
    assertNull(t.get(7));
    count++;
  }
  assertEquals(3, count);
  assertEquals(1, num5nulls);
  driver.run("drop table junit_unparted");
}
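On the Pig side, the Hive binary column f surfaces as an org.apache.pig.data.DataByteArray, which is what the instanceof assertion above checks. A minimal sketch of pulling the raw bytes back out of such a field, reusing a tuple t taken from an iterator like the one above:

// t is assumed to be a Tuple read back through HCatLoader, as in the loop above.
Object field = t.get(6);                                 // binary column f
if (field instanceof DataByteArray) {
  byte[] raw = ((DataByteArray) field).get();            // underlying bytes stored in Hive
  String text = new String(raw, java.nio.charset.StandardCharsets.UTF_8);  // e.g. "binary-data"
}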
use of org.apache.pig.data.DataByteArray in project hive by apache.
the class HCatBaseStorer method getJavaObj.
/**
 * Convert from a Pig value object to a Hive value object.
 * This method assumes that {@link #validateSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema, org.apache.hive.hcatalog.data.schema.HCatFieldSchema, org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.hive.hcatalog.data.schema.HCatSchema, int)},
 * which checks that the types in the Pig schema are compatible with the target Hive table, has already been called.
 */
private Object getJavaObj(Object pigObj, HCatFieldSchema hcatFS) throws HCatException, BackendException {
  try {
    if (pigObj == null) {
      return null;
    }
    // The real work-horse. Spend time and energy in this method if there is
    // need to keep HCatStorer lean and go fast.
    Type type = hcatFS.getType();
    switch (type) {
      case BINARY:
        return ((DataByteArray) pigObj).get();
      case STRUCT:
        HCatSchema structSubSchema = hcatFS.getStructSubSchema();
        // Unwrap the tuple.
        List<Object> all = ((Tuple) pigObj).getAll();
        ArrayList<Object> converted = new ArrayList<Object>(all.size());
        for (int i = 0; i < all.size(); i++) {
          converted.add(getJavaObj(all.get(i), structSubSchema.get(i)));
        }
        return converted;
      case ARRAY:
        // Unwrap the bag.
        DataBag pigBag = (DataBag) pigObj;
        HCatFieldSchema tupFS = hcatFS.getArrayElementSchema().get(0);
        boolean needTuple = tupFS.getType() == Type.STRUCT;
        List<Object> bagContents = new ArrayList<Object>((int) pigBag.size());
        Iterator<Tuple> bagItr = pigBag.iterator();
        while (bagItr.hasNext()) {
          // If there is only one element in tuple contained in bag, we throw away the tuple.
          bagContents.add(getJavaObj(needTuple ? bagItr.next() : bagItr.next().get(0), tupFS));
        }
        return bagContents;
      case MAP:
        Map<?, ?> pigMap = (Map<?, ?>) pigObj;
        Map<Object, Object> typeMap = new HashMap<Object, Object>();
        for (Entry<?, ?> entry : pigMap.entrySet()) {
          // The value has a schema and not a FieldSchema.
          // Schema validation enforces that the key is a String.
          typeMap.put((String) entry.getKey(),
              getJavaObj(entry.getValue(), hcatFS.getMapValueSchema().get(0)));
        }
        return typeMap;
      case STRING:
      case INT:
      case BIGINT:
      case FLOAT:
      case DOUBLE:
        return pigObj;
      case SMALLINT:
        if ((Integer) pigObj < Short.MIN_VALUE || (Integer) pigObj > Short.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return ((Integer) pigObj).shortValue();
      case TINYINT:
        if ((Integer) pigObj < Byte.MIN_VALUE || (Integer) pigObj > Byte.MAX_VALUE) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return ((Integer) pigObj).byteValue();
      case BOOLEAN:
        if (pigObj instanceof String) {
          if (((String) pigObj).trim().compareTo("0") == 0) {
            return Boolean.FALSE;
          }
          if (((String) pigObj).trim().compareTo("1") == 0) {
            return Boolean.TRUE;
          }
          throw new BackendException("Unexpected type " + type + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
        }
        return Boolean.parseBoolean(pigObj.toString());
      case DECIMAL:
        BigDecimal bd = (BigDecimal) pigObj;
        DecimalTypeInfo dti = (DecimalTypeInfo) hcatFS.getTypeInfo();
        if (bd.precision() > dti.precision() || bd.scale() > dti.scale()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return HiveDecimal.create(bd);
      case CHAR:
        String charVal = (String) pigObj;
        CharTypeInfo cti = (CharTypeInfo) hcatFS.getTypeInfo();
        if (charVal.length() > cti.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveChar(charVal, cti.getLength());
      case VARCHAR:
        String varcharVal = (String) pigObj;
        VarcharTypeInfo vti = (VarcharTypeInfo) hcatFS.getTypeInfo();
        if (varcharVal.length() > vti.getLength()) {
          handleOutOfRangeValue(pigObj, hcatFS);
          return null;
        }
        return new HiveVarchar(varcharVal, vti.getLength());
      case TIMESTAMP:
        DateTime dt = (DateTime) pigObj;
        // getMillis() returns UTC time regardless of TZ
        return new Timestamp(dt.getMillis());
      case DATE:
        /*
         * We ignore any TZ setting on Pig value since java.sql.Date doesn't have it (in any
         * meaningful way). So the assumption is that if Pig value has 0 time component (midnight)
         * we assume it reasonably 'fits' into a Hive DATE. If time part is not 0, it's considered
         * out of range for target type.
         */
        DateTime dateTime = ((DateTime) pigObj);
        if (dateTime.getMillisOfDay() != 0) {
          handleOutOfRangeValue(pigObj, hcatFS, "Time component must be 0 (midnight) in local timezone; Local TZ val='" + pigObj + "'");
          return null;
        }
        /*
         * java.sql.Date is a poorly defined API. Some (all?) SerDes call toString() on it
         * [e.g. LazySimpleSerDe, uses LazyUtils.writePrimitiveUTF8()], which automatically adjusts
         * for local timezone. Date.valueOf() also uses local timezone (as does Date(int,int,int)).
         * Also see PigHCatUtil#extractPigObject() for the corresponding read op. This way a DATETIME from Pig,
         * when stored into Hive and read back, comes back with the same value.
         */
        return new Date(dateTime.getYear() - 1900, dateTime.getMonthOfYear() - 1, dateTime.getDayOfMonth());
      default:
        throw new BackendException("Unexpected HCat type " + type + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
    }
  } catch (BackendException e) {
    // provide the path to the field in the error message
    throw new BackendException((hcatFS.getName() == null ? " " : hcatFS.getName() + ".") + e.getMessage(), e);
  }
}
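For orientation, a minimal sketch of how a per-field converter like this is typically driven. This is not the actual HCatBaseStorer.putNext() body; tuple and computedSchema are assumed to be the incoming Pig Tuple and an already-resolved HCatSchema for the target table:

// Hypothetical caller: convert each Pig field into its Hive-side value,
// walking the Pig tuple and the HCat schema in parallel.
List<Object> hiveRow = new ArrayList<Object>(tuple.size());
for (int i = 0; i < tuple.size(); i++) {
  HCatFieldSchema fieldSchema = computedSchema.get(i);   // assumed resolved target schema
  hiveRow.add(getJavaObj(tuple.get(i), fieldSchema));    // Pig value -> Hive value
}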
use of org.apache.pig.data.DataByteArray in project pygmalion by jeromatron.
the class GenerateBinTimeUUID method exec.
@Override
public DataByteArray exec(Tuple input) throws IOException {
  UUID rval = null;
  if (!input.isNull(0) && input.get(0) instanceof Long) {
    Long time = (Long) input.get(0);
    rval = TimeUUIDUtils.getTimeUUID(time);
  } else {
    rval = TimeUUIDUtils.getUniqueTimeUUIDinMillis();
  }
  return new DataByteArray(TimeUUIDUtils.asByteArray(rval));
}
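A test-style invocation of the UDF, as a minimal sketch assuming the default constructor and Pig's TupleFactory; when field 0 is null or not a Long, exec() falls back to a freshly generated time UUID:

GenerateBinTimeUUID udf = new GenerateBinTimeUUID();
Tuple in = TupleFactory.getInstance().newTuple(1);
in.set(0, System.currentTimeMillis());     // millisecond timestamp drives the UUID's time component
DataByteArray uuidBytes = udf.exec(in);    // time-based UUID bytes wrapped as a DataByteArray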
use of org.apache.pig.data.DataByteArray in project mongo-hadoop by mongodb.
the class BSONLoader method convertBSONtoPigType.
/**
 * Convert an object from a MongoDB document into a type that Pig can
 * understand, based on the type of the input object.
 * @param o object from a MongoDB document
 * @return an object appropriate for Pig
 * @throws ExecException for lower-level Pig errors
 */
public static Object convertBSONtoPigType(final Object o) throws ExecException {
  if (o == null) {
    return null;
  } else if (o instanceof Number || o instanceof String) {
    return o;
  } else if (o instanceof Date) {
    return ((Date) o).getTime();
  } else if (o instanceof ObjectId) {
    return o.toString();
  } else if (o instanceof UUID) {
    return o.toString();
  } else if (o instanceof BasicBSONList) {
    BasicBSONList bl = (BasicBSONList) o;
    Tuple t = tupleFactory.newTuple(bl.size());
    for (int i = 0; i < bl.size(); i++) {
      t.set(i, convertBSONtoPigType(bl.get(i)));
    }
    return t;
  } else if (o instanceof Map) {
    // TODO make this more efficient for lazy objects?
    Map<String, Object> fieldsMap = (Map<String, Object>) o;
    HashMap<String, Object> pigMap = new HashMap<String, Object>(fieldsMap.size());
    for (Map.Entry<String, Object> field : fieldsMap.entrySet()) {
      pigMap.put(field.getKey(), convertBSONtoPigType(field.getValue()));
    }
    return pigMap;
  } else if (o instanceof byte[]) {
    return new DataByteArray((byte[]) o);
  } else if (o instanceof Binary) {
    return new DataByteArray(((Binary) o).getData());
  } else if (o instanceof DBRef) {
    HashMap<String, String> pigMap = new HashMap<String, String>(2);
    pigMap.put("$ref", ((DBRef) o).getCollectionName());
    pigMap.put("$id", ((DBRef) o).getId().toString());
    return pigMap;
  } else {
    return o;
  }
}
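A minimal sketch (not part of BSONLoader itself) of applying the conversion to a whole document: every top-level field is mapped through convertBSONtoPigType and packed into a Pig tuple, with byte[] and Binary values ending up as DataByteArray fields. BasicBSONObject and TupleFactory are assumed to be available from the MongoDB Java driver and Pig:

// Build a small BSON document and convert it field by field into a Pig row.
BSONObject doc = new BasicBSONObject("name", "pig").append("payload", new byte[] { 1, 2, 3 });
Tuple row = TupleFactory.getInstance().newTuple(doc.keySet().size());
int idx = 0;
for (String key : doc.keySet()) {
  row.set(idx++, BSONLoader.convertBSONtoPigType(doc.get(key)));  // "payload" becomes a DataByteArray
}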
use of org.apache.pig.data.DataByteArray in project akela by mozilla-metrics.
the class HBaseMultiScanLoader method getNext.
/* (non-Javadoc)
 * @see org.apache.pig.LoadFunc#getNext()
 */
@Override
public Tuple getNext() throws IOException {
  try {
    if (reader.nextKeyValue()) {
      ImmutableBytesWritable rowKey = reader.getCurrentKey();
      Result result = reader.getCurrentValue();
      Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);
      tuple.set(0, new DataByteArray(rowKey.get()));
      int i = 1;
      for (Pair<String, String> pair : columns) {
        byte[] v = result.getValue(pair.getFirst().getBytes(), pair.getSecond().getBytes());
        if (v != null) {
          tuple.set(i, new DataByteArray(v));
        }
        i++;
      }
      return tuple;
    }
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
  return null;
}
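A minimal sketch, assuming HBase's Bytes utility, of how a consumer might unpack a tuple produced above: the row key sits at index 0 and each requested column follows, left null when the cell was absent:

DataByteArray key = (DataByteArray) tuple.get(0);      // row key bytes wrapped for Pig
String rowKey = Bytes.toString(key.get());
for (int col = 1; col < tuple.size(); col++) {
  Object value = tuple.get(col);                       // DataByteArray, or null if the cell was missing
}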