Search in sources :

Example 1 with DataByteArray

use of org.apache.pig.data.DataByteArray in project hive by apache.

the class AbstractHCatStorerTest method testStoreFuncAllSimpleTypes.

/**
 * Round-trips three rows of simple-typed data through Pig and HCatalog: the rows are written to a
 * text file, loaded by Pig with a seven-field schema, stored into the eight-column Hive table
 * {@code junit_unparted} with {@link HCatStorer}, and then verified twice - once through the Hive
 * driver (string output) and once by reading the table back with {@link HCatLoader}.
 * <p>
 * The table has one more column ({@code g binary}) than the Pig schema, so that column is expected
 * to be NULL in every row.
 *
 * @throws IOException if the table cannot be created or a Pig operation fails
 * @throws CommandNeedRetryException declared because the Hive driver calls may throw it
 */
@Test
public void testStoreFuncAllSimpleTypes() throws IOException, CommandNeedRetryException {
    // Start from a clean slate, then create the target table in the storage format under test.
    driver.run("drop table junit_unparted");
    String createTable = "create table junit_unparted(a int, b float, c double, d bigint, e string, h boolean, f binary, g binary) stored as " + storageFormat;
    int retCode = driver.run(createTable).getResponseCode();
    if (retCode != 0) {
        throw new IOException("Failed to create table.");
    }
    // NOTE: in each "input[i++] = ..." statement below the array index is evaluated (and i is
    // incremented) before the right-hand side, so the values on the right use the already
    // incremented i: row 1 is built with i == 2 and row 2 with i == 3. The expected strings in the
    // assertions further down (4.2 / 2.2 / 4 and 3 / 6.2999997 / ... / 6) rely on this.
    int i = 0;
    String[] input = new String[3];
    // Empty values except first column
    input[i++] = "0\t\t\t\t\t\t\t";
    input[i++] = "\t" + i * 2.1f + "\t" + i * 1.1d + "\t" + i * 2L + "\t" + "lets hcat" + "\t" + "true" + // First column empty
    "\tbinary-data";
    input[i++] = i + "\t" + i * 2.1f + "\t" + i * 1.1d + "\t" + i * 2L + "\t" + "lets hcat" + "\t" + "false" + "\tbinary-data";
    HcatTestUtils.createTestDataFile(INPUT_FILE_NAME, input);
    // Load the file with Pig in local mode and store it into the Hive table in one batch.
    PigServer server = new PigServer(ExecType.LOCAL);
    server.setBatchOn();
    server.registerQuery("A = load '" + INPUT_FILE_NAME + "' as (a:int, b:float, c:double, d:long, e:chararray, h:boolean, f:bytearray);");
    // null gets stored into column g which is a binary field.
    server.registerQuery("store A into 'default.junit_unparted' using " + HCatStorer.class.getName() + "('','a:int, b:float, c:double, d:long, e:chararray, h:boolean, f:bytearray');");
    server.executeBatch();
    // First verification: read the rows back through the Hive driver as tab-separated strings.
    driver.run("select * from junit_unparted");
    ArrayList<String> res = new ArrayList<String>();
    driver.getResults(res);
    Iterator<String> itr = res.iterator();
    String next = itr.next();
    // Empty input fields come back as NULL; the trailing NULL in every row is column g.
    assertEquals("0\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL\tNULL", next);
    assertEquals("NULL\t4.2\t2.2\t4\tlets hcat\ttrue\tbinary-data\tNULL", itr.next());
    assertEquals("3\t6.2999997\t3.3000000000000003\t6\tlets hcat\tfalse\tbinary-data\tNULL", itr.next());
    assertFalse(itr.hasNext());
    // Second verification: read the table back into Pig and check the types of the binary fields.
    server.registerQuery("B = load 'junit_unparted' using " + HCatLoader.class.getName() + ";");
    Iterator<Tuple> iter = server.openIterator("B");
    int count = 0;
    // Despite its name, this counts rows whose field at index 6 is null (presumably column f,
    // going by the column order in the create-table statement above).
    int num5nulls = 0;
    while (iter.hasNext()) {
        Tuple t = iter.next();
        if (t.get(6) == null) {
            num5nulls++;
        } else {
            // Non-null binary values must surface in Pig as DataByteArray.
            assertTrue(t.get(6) instanceof DataByteArray);
        }
        // Field 7 was never written by the store statement, so it must be null in every row.
        assertNull(t.get(7));
        count++;
    }
    assertEquals(3, count);
    // Only the first input row left the field at index 6 empty.
    assertEquals(1, num5nulls);
    driver.run("drop table junit_unparted");
}
Also used : ArrayList(java.util.ArrayList) IOException(java.io.IOException) PigServer(org.apache.pig.PigServer) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test) HCatBaseTest(org.apache.hive.hcatalog.mapreduce.HCatBaseTest)

Example 2 with DataByteArray

use of org.apache.pig.data.DataByteArray in project hive by apache.

the class HCatBaseStorer method getJavaObj.

/**
 * Translates a value produced by Pig into the Java object used for the target Hive column.
 * <p>
 * Type compatibility between the Pig schema and the Hive table is not re-checked here; callers are
 * expected to have run
 * {@link #validateSchema(org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema, org.apache.hive.hcatalog.data.schema.HCatFieldSchema, org.apache.pig.impl.logicalLayer.schema.Schema, org.apache.hive.hcatalog.data.schema.HCatSchema, int)}
 * beforehand, which is why the casts below are unchecked.
 * <p>
 * Structs, arrays and maps are converted recursively. For the bounded types (SMALLINT, TINYINT,
 * DECIMAL, CHAR, VARCHAR, DATE) a value that does not fit is handed to
 * {@code handleOutOfRangeValue} and, if that call returns, {@code null} is produced instead.
 *
 * @param pigObj the Pig value; {@code null} is passed through unchanged
 * @param hcatFS schema of the Hive field the value is destined for
 * @return the converted value, or {@code null} for a null or out-of-range input
 * @throws HCatException declared for the schema accessors used during conversion
 * @throws BackendException if the HCat type is unsupported or a String cannot be read as a
 *         boolean; the message is prefixed with the field name to give the path to the field
 */
private Object getJavaObj(Object pigObj, HCatFieldSchema hcatFS) throws HCatException, BackendException {
    try {
        if (pigObj == null) {
            return null;
        }
        // This conversion runs for every stored value; optimisation effort aimed at making
        // HCatStorer faster is best spent here.
        Type type = hcatFS.getType();
        switch(type) {
            case BINARY: {
                DataByteArray bytes = (DataByteArray) pigObj;
                return bytes.get();
            }
            case STRUCT: {
                HCatSchema subSchema = hcatFS.getStructSubSchema();
                // A Pig tuple becomes a list of converted fields, matched to the sub-schema by position.
                List<Object> fields = ((Tuple) pigObj).getAll();
                ArrayList<Object> structValues = new ArrayList<Object>(fields.size());
                int position = 0;
                for (Object field : fields) {
                    structValues.add(getJavaObj(field, subSchema.get(position)));
                    position++;
                }
                return structValues;
            }
            case ARRAY: {
                // A Pig bag becomes a list of converted elements.
                DataBag bag = (DataBag) pigObj;
                HCatFieldSchema elementFS = hcatFS.getArrayElementSchema().get(0);
                boolean keepTuple = elementFS.getType() == Type.STRUCT;
                List<Object> elements = new ArrayList<Object>((int) bag.size());
                for (Iterator<Tuple> members = bag.iterator(); members.hasNext(); ) {
                    Tuple member = members.next();
                    Object element;
                    if (keepTuple) {
                        element = member;
                    } else {
                        // Non-struct elements: discard the wrapping tuple and keep only its first field.
                        element = member.get(0);
                    }
                    elements.add(getJavaObj(element, elementFS));
                }
                return elements;
            }
            case MAP: {
                Map<?, ?> source = (Map<?, ?>) pigObj;
                Map<Object, Object> result = new HashMap<Object, Object>();
                for (Entry<?, ?> mapping : source.entrySet()) {
                    // Schema validation enforces String keys. The map value is described by a
                    // schema rather than a FieldSchema, hence the get(0).
                    String key = (String) mapping.getKey();
                    Object value = getJavaObj(mapping.getValue(), hcatFS.getMapValueSchema().get(0));
                    result.put(key, value);
                }
                return result;
            }
            case STRING:
            case INT:
            case BIGINT:
            case FLOAT:
            case DOUBLE:
                // These types are handed over as-is.
                return pigObj;
            case SMALLINT: {
                Integer smallCandidate = (Integer) pigObj;
                if (smallCandidate < Short.MIN_VALUE || smallCandidate > Short.MAX_VALUE) {
                    handleOutOfRangeValue(pigObj, hcatFS);
                    return null;
                }
                return smallCandidate.shortValue();
            }
            case TINYINT: {
                Integer tinyCandidate = (Integer) pigObj;
                if (tinyCandidate < Byte.MIN_VALUE || tinyCandidate > Byte.MAX_VALUE) {
                    handleOutOfRangeValue(pigObj, hcatFS);
                    return null;
                }
                return tinyCandidate.byteValue();
            }
            case BOOLEAN: {
                if (pigObj instanceof String) {
                    // Strings are only accepted in the numeric forms "0" and "1" (ignoring surrounding whitespace).
                    String trimmed = ((String) pigObj).trim();
                    if ("0".equals(trimmed)) {
                        return Boolean.FALSE;
                    }
                    if ("1".equals(trimmed)) {
                        return Boolean.TRUE;
                    }
                    throw new BackendException("Unexpected type " + type + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
                }
                return Boolean.parseBoolean(pigObj.toString());
            }
            case DECIMAL: {
                BigDecimal decimal = (BigDecimal) pigObj;
                DecimalTypeInfo decimalInfo = (DecimalTypeInfo) hcatFS.getTypeInfo();
                boolean tooPrecise = decimal.precision() > decimalInfo.precision();
                if (tooPrecise || decimal.scale() > decimalInfo.scale()) {
                    handleOutOfRangeValue(pigObj, hcatFS);
                    return null;
                }
                return HiveDecimal.create(decimal);
            }
            case CHAR: {
                String text = (String) pigObj;
                CharTypeInfo charInfo = (CharTypeInfo) hcatFS.getTypeInfo();
                if (text.length() > charInfo.getLength()) {
                    handleOutOfRangeValue(pigObj, hcatFS);
                    return null;
                }
                return new HiveChar(text, charInfo.getLength());
            }
            case VARCHAR: {
                String text = (String) pigObj;
                VarcharTypeInfo varcharInfo = (VarcharTypeInfo) hcatFS.getTypeInfo();
                if (text.length() > varcharInfo.getLength()) {
                    handleOutOfRangeValue(pigObj, hcatFS);
                    return null;
                }
                return new HiveVarchar(text, varcharInfo.getLength());
            }
            case TIMESTAMP: {
                // getMillis() is UTC-based regardless of the time zone carried by the DateTime.
                DateTime instant = (DateTime) pigObj;
                return new Timestamp(instant.getMillis());
            }
            case DATE: {
                // java.sql.Date has no meaningful time-zone notion, so any TZ on the Pig value is
                // ignored. A value whose time-of-day is exactly midnight is taken to fit a Hive
                // DATE; anything with a non-zero time part is treated as out of range.
                DateTime day = (DateTime) pigObj;
                if (day.getMillisOfDay() != 0) {
                    handleOutOfRangeValue(pigObj, hcatFS, "Time component must be 0 (midnight) in local timezone; Local TZ val='" + pigObj + "'");
                    return null;
                }
                // java.sql.Date is a poorly defined API. Some (all?) SerDes call toString() on it
                // (e.g. LazySimpleSerDe via LazyUtils.writePrimitiveUTF8()), which adjusts for the
                // local time zone, as do Date.valueOf() and Date(int,int,int). Building the Date
                // from year/month/day therefore lets a Pig DATETIME stored into Hive come back
                // with the same value; PigHCatUtil#extractPigObject() is the matching read side.
                int yearSince1900 = day.getYear() - 1900;
                int zeroBasedMonth = day.getMonthOfYear() - 1;
                return new Date(yearSince1900, zeroBasedMonth, day.getDayOfMonth());
            }
            default:
                throw new BackendException("Unexpected HCat type " + type + " for value " + pigObj + " of class " + pigObj.getClass().getName(), PigHCatUtil.PIG_EXCEPTION_CODE);
        }
    } catch (BackendException cause) {
        // Prefix the message with this field's name so nested failures spell out the path to the field.
        String fieldName = hcatFS.getName();
        String path = fieldName == null ? " " : fieldName + ".";
        throw new BackendException(path + cause.getMessage(), cause);
    }
}
Also used : VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) Timestamp(java.sql.Timestamp) DateTime(org.joda.time.DateTime) HCatSchema(org.apache.hive.hcatalog.data.schema.HCatSchema) DataByteArray(org.apache.pig.data.DataByteArray) DataBag(org.apache.pig.data.DataBag) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) BigDecimal(java.math.BigDecimal) Date(java.sql.Date) HCatFieldSchema(org.apache.hive.hcatalog.data.schema.HCatFieldSchema) BackendException(org.apache.pig.backend.BackendException) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) DataType(org.apache.pig.data.DataType) Type(org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type) Map(java.util.Map) HashMap(java.util.HashMap) Tuple(org.apache.pig.data.Tuple)

Example 3 with DataByteArray

use of org.apache.pig.data.DataByteArray in project pygmalion by jeromatron.

the class GenerateBinTimeUUID method exec.

/**
 * Produces a time-based UUID as a Pig byte array.
 * <p>
 * When the first field of {@code input} is a non-null {@code Long}, the UUID is derived from that
 * value via {@code TimeUUIDUtils.getTimeUUID}; otherwise
 * {@code TimeUUIDUtils.getUniqueTimeUUIDinMillis()} supplies the UUID.
 *
 * @param input tuple whose first field may carry the time value
 * @return the UUID bytes as returned by {@code TimeUUIDUtils.asByteArray}, wrapped in a {@link DataByteArray}
 * @throws IOException if reading from the tuple fails
 */
@Override
public DataByteArray exec(Tuple input) throws IOException {
    final UUID uuid;
    final boolean timeSupplied = !input.isNull(0) && input.get(0) instanceof Long;
    if (timeSupplied) {
        Long time = (Long) input.get(0);
        uuid = TimeUUIDUtils.getTimeUUID(time);
    } else {
        uuid = TimeUUIDUtils.getUniqueTimeUUIDinMillis();
    }
    byte[] uuidBytes = TimeUUIDUtils.asByteArray(uuid);
    return new DataByteArray(uuidBytes);
}
Also used : UUID(java.util.UUID) DataByteArray(org.apache.pig.data.DataByteArray)

Example 4 with DataByteArray

use of org.apache.pig.data.DataByteArray in project mongo-hadoop by mongodb.

the class BSONLoader method convertBSONtoPigType.

/**
 * Maps a value taken from a MongoDB document onto a representation Pig can work with. The choice
 * is driven purely by the runtime type of the argument:
 * <ul>
 *   <li>{@code null}, numbers and strings are returned unchanged;</li>
 *   <li>a {@code Date} becomes its {@code getTime()} value;</li>
 *   <li>{@code ObjectId} and {@code UUID} become their string form;</li>
 *   <li>a {@code BasicBSONList} becomes a tuple whose fields are converted recursively;</li>
 *   <li>any other {@code Map} becomes a {@code HashMap} with recursively converted values;</li>
 *   <li>{@code byte[]} and {@code Binary} are wrapped in a {@code DataByteArray};</li>
 *   <li>a {@code DBRef} becomes a two-entry map with keys {@code $ref} and {@code $id};</li>
 *   <li>anything else is returned unchanged.</li>
 * </ul>
 *
 * @param o object from a MongoDB document
 * @return object appropriate for pig
 * @throws ExecException for lower-level Pig errors
 */
public static Object convertBSONtoPigType(final Object o) throws ExecException {
    if (o == null) {
        return null;
    }
    if (o instanceof Number || o instanceof String) {
        return o;
    }
    if (o instanceof Date) {
        return ((Date) o).getTime();
    }
    if (o instanceof ObjectId || o instanceof UUID) {
        return o.toString();
    }
    if (o instanceof BasicBSONList) {
        BasicBSONList elements = (BasicBSONList) o;
        final int length = elements.size();
        Tuple tuple = tupleFactory.newTuple(length);
        for (int pos = 0; pos < length; pos++) {
            tuple.set(pos, convertBSONtoPigType(elements.get(pos)));
        }
        return tuple;
    }
    if (o instanceof Map) {
        //TODO make this more efficient for lazy objects?
        Map<String, Object> document = (Map<String, Object>) o;
        HashMap<String, Object> converted = new HashMap<String, Object>(document.size());
        for (Map.Entry<String, Object> entry : document.entrySet()) {
            Object pigValue = convertBSONtoPigType(entry.getValue());
            converted.put(entry.getKey(), pigValue);
        }
        return converted;
    }
    if (o instanceof byte[]) {
        return new DataByteArray((byte[]) o);
    }
    if (o instanceof Binary) {
        byte[] payload = ((Binary) o).getData();
        return new DataByteArray(payload);
    }
    if (o instanceof DBRef) {
        DBRef reference = (DBRef) o;
        HashMap<String, String> refMap = new HashMap<String, String>(2);
        refMap.put("$ref", reference.getCollectionName());
        refMap.put("$id", reference.getId().toString());
        return refMap;
    }
    // No special handling for this type: hand the object over as it is.
    return o;
}
Also used : ObjectId(org.bson.types.ObjectId) HashMap(java.util.HashMap) BasicBSONList(org.bson.types.BasicBSONList) DBRef(com.mongodb.DBRef) Date(java.util.Date) BasicBSONObject(org.bson.BasicBSONObject) BasicDBObject(com.mongodb.BasicDBObject) BSONObject(org.bson.BSONObject) Binary(org.bson.types.Binary) UUID(java.util.UUID) HashMap(java.util.HashMap) Map(java.util.Map) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 5 with DataByteArray

use of org.apache.pig.data.DataByteArray in project akela by mozilla-metrics.

the class HBaseMultiScanLoader method getNext.

/*
 * Reads the next HBase row from the record reader and turns it into a Pig tuple.
 *
 * Field 0 holds the row key as a DataByteArray. Fields 1..n follow the order of `columns`, each
 * holding the cell value for that (first, second) pair as a DataByteArray; a column with no value
 * in the row keeps whatever TupleFactory.newTuple(int) put in that slot.
 *
 * Returns null when the reader has no further rows.
 *
 * Throws IOException if the reader fails, or if the thread is interrupted while reading. In the
 * latter case the interrupt status is restored before the exception is thrown, so code further up
 * the stack can still observe the interruption.
 *
 * (non-Javadoc)
 * @see org.apache.pig.LoadFunc#getNext()
 */
@Override
public Tuple getNext() throws IOException {
    try {
        if (reader.nextKeyValue()) {
            ImmutableBytesWritable rowKey = reader.getCurrentKey();
            Result result = reader.getCurrentValue();
            // One slot for the row key plus one per requested column.
            Tuple tuple = TupleFactory.getInstance().newTuple(columns.size() + 1);
            tuple.set(0, new DataByteArray(rowKey.get()));
            int i = 1;
            for (Pair<String, String> pair : columns) {
                // NOTE(review): getBytes() uses the platform default charset; confirm this matches
                // how the column names were encoded when the table was written.
                byte[] v = result.getValue(pair.getFirst().getBytes(), pair.getSecond().getBytes());
                if (v != null) {
                    tuple.set(i, new DataByteArray(v));
                }
                // Advance even when the value is missing so later columns keep their positions.
                i++;
            }
            return tuple;
        }
    } catch (InterruptedException e) {
        // Wrapping the exception would otherwise swallow the interruption: restore the flag first.
        Thread.currentThread().interrupt();
        throw new IOException(e);
    }
    return null;
}
Also used : ImmutableBytesWritable(org.apache.hadoop.hbase.io.ImmutableBytesWritable) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Result(org.apache.hadoop.hbase.client.Result)

Aggregations

DataByteArray (org.apache.pig.data.DataByteArray): 11, Tuple (org.apache.pig.data.Tuple): 8, IOException (java.io.IOException): 4, Map (java.util.Map): 4, Timestamp (java.sql.Timestamp): 3, ArrayList (java.util.ArrayList): 3, HashMap (java.util.HashMap): 3, DateTime (org.joda.time.DateTime): 3, Date (java.sql.Date): 2, UUID (java.util.UUID): 2, Type (org.apache.hive.hcatalog.data.schema.HCatFieldSchema.Type): 2, DataType (org.apache.pig.data.DataType): 2, ImmutableMap (com.google.common.collect.ImmutableMap): 1, BasicDBObject (com.mongodb.BasicDBObject): 1, DBRef (com.mongodb.DBRef): 1, InterruptedException (java.lang.InterruptedException): 1, BigDecimal (java.math.BigDecimal): 1, ResultSet (java.sql.ResultSet): 1, SQLException (java.sql.SQLException): 1, Statement (java.sql.Statement): 1