Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
Class TestLazyBinaryFast, method testLazyBinaryFastCase.
public void testLazyBinaryFastCase(int caseNum, boolean doNonRandomFill, Random r) throws Throwable {
SerdeRandomRowSource source = new SerdeRandomRowSource();
source.init(r);
int rowCount = 1000;
Object[][] rows = source.randomRows(rowCount);
if (doNonRandomFill) {
MyTestClass.nonRandomRowFill(rows, source.primitiveCategories());
}
StructObjectInspector rowStructObjectInspector = source.rowStructObjectInspector();
PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos();
int columnCount = primitiveTypeInfos.length;
int writeColumnCount = columnCount;
StructObjectInspector writeRowStructObjectInspector = rowStructObjectInspector;
boolean doWriteFewerColumns = r.nextBoolean();
if (doWriteFewerColumns) {
writeColumnCount = 1 + r.nextInt(columnCount);
if (writeColumnCount == columnCount) {
doWriteFewerColumns = false;
} else {
writeRowStructObjectInspector = source.partialRowStructObjectInspector(writeColumnCount);
}
}
String fieldNames = ObjectInspectorUtils.getFieldNames(rowStructObjectInspector);
String fieldTypes = ObjectInspectorUtils.getFieldTypes(rowStructObjectInspector);
AbstractSerDe serde = TestLazyBinarySerDe.getSerDe(fieldNames, fieldTypes);
AbstractSerDe serde_fewer = null;
if (doWriteFewerColumns) {
String partialFieldNames = ObjectInspectorUtils.getFieldNames(writeRowStructObjectInspector);
String partialFieldTypes = ObjectInspectorUtils.getFieldTypes(writeRowStructObjectInspector);
serde_fewer = TestLazyBinarySerDe.getSerDe(partialFieldNames, partialFieldTypes);
}
testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, primitiveTypeInfos, /* useIncludeColumns */ false, /* doWriteFewerColumns */ false, r);
testLazyBinaryFast(source, rows, serde, rowStructObjectInspector, serde_fewer, writeRowStructObjectInspector, primitiveTypeInfos, /* useIncludeColumns */ true, /* doWriteFewerColumns */ false, r);
/*
* Can the LazyBinary format really tolerate writing fewer columns?
*/
// if (doWriteFewerColumns) {
// testLazyBinaryFast(
// source, rows,
// serde, rowStructObjectInspector,
// serde_fewer, writeRowStructObjectInspector,
// primitiveTypeInfos,
// /* useIncludeColumns */ false, /* doWriteFewerColumns */ true, r);
// testLazyBinaryFast(
// source, rows,
// serde, rowStructObjectInspector,
// serde_fewer, writeRowStructObjectInspector,
// primitiveTypeInfos,
// /* useIncludeColumns */ true, /* doWriteFewerColumns */ true, r);
// }
}
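The TestLazyBinarySerDe.getSerDe helper called above is not reproduced on this page. Below is a minimal sketch of what such a helper typically does, assuming the standard serdeConstants column properties and the three-argument AbstractSerDe.initialize used elsewhere on this page; the factory class name is hypothetical, and this is not the actual Hive test helper.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe;

public class LazyBinarySerDeFactory {

  // Builds a LazyBinarySerDe from comma-separated field names and types,
  // e.g. ("a,b", "int,string"). Sketch only; the real helper lives in TestLazyBinarySerDe.
  public static AbstractSerDe getSerDe(String fieldNames, String fieldTypes) throws SerDeException {
    Properties schema = new Properties();
    schema.setProperty(serdeConstants.LIST_COLUMNS, fieldNames);
    schema.setProperty(serdeConstants.LIST_COLUMN_TYPES, fieldTypes);

    LazyBinarySerDe serde = new LazyBinarySerDe();
    // The initialize signature differs across Hive versions; the three-argument
    // form used by the ExecReducer snippet further down this page is shown here.
    serde.initialize(new Configuration(), schema, null);
    return serde;
  }
}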
Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
Class TestLazyBinaryFast, method testLazyBinaryFast.
private void testLazyBinaryFast(SerdeRandomRowSource source, Object[][] rows, AbstractSerDe serde, StructObjectInspector rowOI, AbstractSerDe serde_fewer, StructObjectInspector writeRowOI, PrimitiveTypeInfo[] primitiveTypeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {
int rowCount = rows.length;
int columnCount = primitiveTypeInfos.length;
boolean[] columnsToInclude = null;
if (useIncludeColumns) {
columnsToInclude = new boolean[columnCount];
for (int i = 0; i < columnCount; i++) {
columnsToInclude[i] = r.nextBoolean();
}
}
int writeColumnCount = columnCount;
PrimitiveTypeInfo[] writePrimitiveTypeInfos = primitiveTypeInfos;
if (doWriteFewerColumns) {
writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
writePrimitiveTypeInfos = Arrays.copyOf(primitiveTypeInfos, writeColumnCount);
}
LazyBinarySerializeWrite lazyBinarySerializeWrite = new LazyBinarySerializeWrite(writeColumnCount);
// Try to serialize
BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];
for (int i = 0; i < rowCount; i++) {
Object[] row = rows[i];
Output output = new Output();
lazyBinarySerializeWrite.set(output);
for (int index = 0; index < writeColumnCount; index++) {
Writable writable = (Writable) row[index];
VerifyFast.serializeWrite(lazyBinarySerializeWrite, primitiveTypeInfos[index], writable);
}
BytesWritable bytesWritable = new BytesWritable();
bytesWritable.set(output.getData(), 0, output.getLength());
serializeWriteBytes[i] = bytesWritable;
}
// Try to deserialize
for (int i = 0; i < rowCount; i++) {
Object[] row = rows[i];
// Specifying the right type info length tells LazyBinaryDeserializeRead which is the last
// column.
LazyBinaryDeserializeRead lazyBinaryDeserializeRead = new LazyBinaryDeserializeRead(writePrimitiveTypeInfos, /* useExternalBuffer */ false);
BytesWritable bytesWritable = serializeWriteBytes[i];
lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
for (int index = 0; index < columnCount; index++) {
if (useIncludeColumns && !columnsToInclude[index]) {
lazyBinaryDeserializeRead.skipNextField();
} else if (index >= writeColumnCount) {
// Should come back a null.
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
} else {
Writable writable = (Writable) row[index];
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
}
}
if (writeColumnCount == columnCount) {
TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
}
}
// Try to deserialize, using the SerDe class, the Writable row objects created by SerializeWrite.
for (int i = 0; i < rowCount; i++) {
BytesWritable bytesWritable = serializeWriteBytes[i];
LazyBinaryStruct lazyBinaryStruct;
if (doWriteFewerColumns) {
lazyBinaryStruct = (LazyBinaryStruct) serde_fewer.deserialize(bytesWritable);
} else {
lazyBinaryStruct = (LazyBinaryStruct) serde.deserialize(bytesWritable);
}
Object[] row = rows[i];
for (int index = 0; index < writeColumnCount; index++) {
PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[index];
Writable writable = (Writable) row[index];
Object object = lazyBinaryStruct.getField(index);
if (writable == null || object == null) {
if (writable != null || object != null) {
fail("SerDe deserialized NULL column mismatch");
}
} else {
if (!object.equals(writable)) {
fail("SerDe deserialized value does not match");
}
}
}
}
// One Writable per row.
BytesWritable[] serdeBytes = new BytesWritable[rowCount];
// Serialize using the SerDe, then below deserialize using DeserializeRead.
Object[] serdeRow = new Object[writeColumnCount];
for (int i = 0; i < rowCount; i++) {
Object[] row = rows[i];
// LazyBinary seems to work better with a row object array instead of a Java object...
for (int index = 0; index < writeColumnCount; index++) {
serdeRow[index] = row[index];
}
BytesWritable serialized;
if (doWriteFewerColumns) {
serialized = (BytesWritable) serde_fewer.serialize(serdeRow, writeRowOI);
} else {
serialized = (BytesWritable) serde.serialize(serdeRow, rowOI);
}
BytesWritable bytesWritable = new BytesWritable(Arrays.copyOfRange(serialized.getBytes(), 0, serialized.getLength()));
byte[] bytes1 = bytesWritable.getBytes();
BytesWritable lazySerializedWriteBytes = serializeWriteBytes[i];
byte[] bytes2 = Arrays.copyOfRange(lazySerializedWriteBytes.getBytes(), 0, lazySerializedWriteBytes.getLength());
if (bytes1.length != bytes2.length) {
fail("SerializeWrite length " + bytes2.length + " and " + "SerDe serialization length " + bytes1.length + " do not match (" + Arrays.toString(primitiveTypeInfos) + ")");
}
if (!Arrays.equals(bytes1, bytes2)) {
fail("SerializeWrite and SerDe serialization does not match (" + Arrays.toString(primitiveTypeInfos) + ")");
}
serdeBytes[i] = bytesWritable;
}
// Try to deserialize, using DeserializeRead, the Writable row objects created by the SerDe.
for (int i = 0; i < rowCount; i++) {
Object[] row = rows[i];
// When doWriteFewerColumns, try to read more fields than exist in buffer.
LazyBinaryDeserializeRead lazyBinaryDeserializeRead = new LazyBinaryDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
BytesWritable bytesWritable = serdeBytes[i];
lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
for (int index = 0; index < columnCount; index++) {
if (useIncludeColumns && !columnsToInclude[index]) {
lazyBinaryDeserializeRead.skipNextField();
} else if (index >= writeColumnCount) {
// Should come back a null.
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
} else {
Writable writable = (Writable) row[index];
VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
}
}
if (writeColumnCount == columnCount) {
TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
}
}
}
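For orientation, the fast-path round trip the test exercises can be reduced to a few lines. The following is a self-contained sketch rather than code from the test; it assumes the same LazyBinarySerializeWrite and LazyBinaryDeserializeRead APIs used above (readNextField plus the public current* fields of DeserializeRead).

import java.nio.charset.StandardCharsets;

import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.fast.DeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

public class LazyBinaryFastRoundTrip {
  public static void main(String[] args) throws Exception {
    // Serialize one row with two columns: int, string.
    LazyBinarySerializeWrite writer = new LazyBinarySerializeWrite(2);
    Output output = new Output();
    writer.set(output);
    writer.writeInt(42);
    writer.writeString("hello".getBytes(StandardCharsets.UTF_8));

    // Deserialize the same buffer; the type info array tells the reader the column layout.
    TypeInfo[] typeInfos = new TypeInfo[] {
        TypeInfoFactory.intTypeInfo, TypeInfoFactory.stringTypeInfo };
    DeserializeRead reader = new LazyBinaryDeserializeRead(typeInfos, /* useExternalBuffer */ false);
    reader.set(output.getData(), 0, output.getLength());

    // readNextField() returns false for a null field, true otherwise.
    if (reader.readNextField()) {
      System.out.println("int column: " + reader.currentInt);
    }
    if (reader.readNextField()) {
      System.out.println("string column: " + new String(
          reader.currentBytes, reader.currentBytesStart, reader.currentBytesLength,
          StandardCharsets.UTF_8));
    }
  }
}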
Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
Class DDLTask, method describeTable.
/**
* Write the description of a table to a file.
*
* @param db
* The database in question.
* @param descTbl
* This is the table we're interested in.
* @return Returns 0 when execution succeeds and above 0 if it fails.
* @throws HiveException
* Throws this exception if an unexpected error occurs.
* @throws MetaException
*/
private int describeTable(Hive db, DescTableDesc descTbl) throws HiveException, MetaException {
String colPath = descTbl.getColumnPath();
String tableName = descTbl.getTableName();
// describe the table - populate the output stream
Table tbl = db.getTable(tableName, false);
if (tbl == null) {
throw new HiveException(ErrorMsg.INVALID_TABLE, tableName);
}
Partition part = null;
if (descTbl.getPartSpec() != null) {
part = db.getPartition(tbl, descTbl.getPartSpec(), false);
if (part == null) {
throw new HiveException(ErrorMsg.INVALID_PARTITION, StringUtils.join(descTbl.getPartSpec().keySet(), ','), tableName);
}
tbl = part.getTable();
}
DataOutputStream outStream = getOutputStream(descTbl.getResFile());
try {
LOG.debug("DDLTask: got data for {}", tableName);
List<FieldSchema> cols = null;
List<ColumnStatisticsObj> colStats = null;
Deserializer deserializer = tbl.getDeserializer(true);
if (deserializer instanceof AbstractSerDe) {
String errorMsgs = ((AbstractSerDe) deserializer).getConfigurationErrors();
if (errorMsgs != null && !errorMsgs.isEmpty()) {
throw new SQLException(errorMsgs);
}
}
if (colPath.equals(tableName)) {
cols = (part == null || tbl.getTableType() == TableType.VIRTUAL_VIEW) ? tbl.getCols() : part.getCols();
if (!descTbl.isFormatted()) {
cols.addAll(tbl.getPartCols());
}
if (tbl.isPartitioned() && part == null) {
// No partition specified for the partitioned table, so fetch stats across all partitions.
Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
Map<String, Long> valueMap = new HashMap<>();
Map<String, Boolean> stateMap = new HashMap<>();
for (String stat : StatsSetupConst.supportedStats) {
valueMap.put(stat, 0L);
stateMap.put(stat, true);
}
PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
int numParts = 0;
for (Partition partition : parts) {
Map<String, String> props = partition.getParameters();
Boolean state = StatsSetupConst.areBasicStatsUptoDate(props);
for (String stat : StatsSetupConst.supportedStats) {
stateMap.put(stat, stateMap.get(stat) && state);
if (props != null && props.get(stat) != null) {
valueMap.put(stat, valueMap.get(stat) + Long.parseLong(props.get(stat)));
}
}
numParts++;
}
for (String stat : StatsSetupConst.supportedStats) {
StatsSetupConst.setBasicStatsState(tblProps, Boolean.toString(stateMap.get(stat)));
tblProps.put(stat, valueMap.get(stat).toString());
}
tblProps.put(StatsSetupConst.NUM_PARTITIONS, Integer.toString(numParts));
tbl.setParameters(tblProps);
}
} else {
if (descTbl.isFormatted()) {
// When a column name is specified in the describe table DDL, colPath will
// be table_name.column_name.
String colName = colPath.split("\\.")[1];
String[] dbTab = Utilities.getDbTableName(tableName);
List<String> colNames = new ArrayList<String>();
colNames.add(colName.toLowerCase());
if (null == part) {
if (tbl.isPartitioned()) {
Map<String, String> tblProps = tbl.getParameters() == null ? new HashMap<String, String>() : tbl.getParameters();
if (tbl.isPartitionKey(colNames.get(0))) {
FieldSchema partCol = tbl.getPartColByName(colNames.get(0));
cols = Collections.singletonList(partCol);
PartitionIterable parts = new PartitionIterable(db, tbl, null, conf.getIntVar(HiveConf.ConfVars.METASTORE_BATCH_RETRIEVE_MAX));
ColumnInfo ci = new ColumnInfo(partCol.getName(), TypeInfoUtils.getTypeInfoFromTypeString(partCol.getType()), null, false);
ColStatistics cs = StatsUtils.getColStatsForPartCol(ci, parts, conf);
ColumnStatisticsData data = new ColumnStatisticsData();
ColStatistics.Range r = cs.getRange();
StatObjectConverter.fillColumnStatisticsData(partCol.getType(), data, r == null ? null : r.minValue, r == null ? null : r.maxValue, r == null ? null : r.minValue, r == null ? null : r.maxValue, r == null ? null : r.minValue.toString(), r == null ? null : r.maxValue.toString(), cs.getNumNulls(), cs.getCountDistint(), null, cs.getAvgColLen(), cs.getAvgColLen(), cs.getNumTrues(), cs.getNumFalses());
ColumnStatisticsObj cso = new ColumnStatisticsObj(partCol.getName(), partCol.getType(), data);
colStats = Collections.singletonList(cso);
StatsSetupConst.setColumnStatsState(tblProps, colNames);
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
List<String> parts = db.getPartitionNames(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), (short) -1);
AggrStats aggrStats = db.getAggrColStatsFor(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames, parts);
colStats = aggrStats.getColStats();
if (parts.size() == aggrStats.getPartsFound()) {
StatsSetupConst.setColumnStatsState(tblProps, colNames);
} else {
StatsSetupConst.removeColumnStatsState(tblProps, colNames);
}
}
tbl.setParameters(tblProps);
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
colStats = db.getTableColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), colNames);
}
} else {
List<String> partitions = new ArrayList<String>();
partitions.add(part.getName());
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
colStats = db.getPartitionColumnStatistics(dbTab[0].toLowerCase(), dbTab[1].toLowerCase(), partitions, colNames).get(part.getName());
}
} else {
cols = Hive.getFieldsFromDeserializer(colPath, deserializer);
}
}
PrimaryKeyInfo pkInfo = null;
ForeignKeyInfo fkInfo = null;
UniqueConstraint ukInfo = null;
NotNullConstraint nnInfo = null;
DefaultConstraint dInfo = null;
CheckConstraint cInfo = null;
if (descTbl.isExt() || descTbl.isFormatted()) {
pkInfo = db.getPrimaryKeys(tbl.getDbName(), tbl.getTableName());
fkInfo = db.getForeignKeys(tbl.getDbName(), tbl.getTableName());
ukInfo = db.getUniqueConstraints(tbl.getDbName(), tbl.getTableName());
nnInfo = db.getNotNullConstraints(tbl.getDbName(), tbl.getTableName());
dInfo = db.getDefaultConstraints(tbl.getDbName(), tbl.getTableName());
cInfo = db.getCheckConstraints(tbl.getDbName(), tbl.getTableName());
}
fixDecimalColumnTypeName(cols);
// In case the query is served by HiveServer2, don't pad it with spaces,
// as HiveServer2 output is consumed by JDBC/ODBC clients.
boolean isOutputPadded = !SessionState.get().isHiveServerQuery();
formatter.describeTable(outStream, colPath, tableName, tbl, part, cols, descTbl.isFormatted(), descTbl.isExt(), isOutputPadded, colStats, pkInfo, fkInfo, ukInfo, nnInfo, dInfo, cInfo);
LOG.debug("DDLTask: written data for {}", tableName);
} catch (SQLException e) {
throw new HiveException(e, ErrorMsg.GENERIC_ERROR, tableName);
} finally {
IOUtils.closeStream(outStream);
}
return 0;
}
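The AbstractSerDe-specific part of this method is the configuration-error check right after the deserializer is obtained. A minimal sketch of that pattern in isolation follows; the helper name and the exception type are illustrative, not taken from DDLTask.

import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.Deserializer;

public class SerDeConfigCheck {

  // Hypothetical helper: surface any configuration problems the SerDe recorded
  // during initialization, mirroring the check in describeTable above.
  static void failOnConfigurationErrors(Deserializer deserializer) {
    if (deserializer instanceof AbstractSerDe) {
      String errors = ((AbstractSerDe) deserializer).getConfigurationErrors();
      if (errors != null && !errors.isEmpty()) {
        throw new IllegalStateException(errors);
      }
    }
  }
}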
Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
Class TestRegexSerDe, method testRegexSerDe.
/**
* Test the RegexSerDe class.
*/
@Test
public void testRegexSerDe() throws Throwable {
try {
// Create the SerDe
AbstractSerDe serDe = createSerDe("host,identity,user,time,request,status,size,referer,agent", "string,string,string,string,string,string,string,string,string", "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") " + "([0-9]*) ([0-9]*) ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\")", "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s");
// Data
Text t = new Text("127.0.0.1 - - [26/May/2009:00:00:00 +0000] " + "\"GET /someurl/?track=Blabla(Main) HTTP/1.1\" 200 5864 - " + "\"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) " + "AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19\"");
// Deserialize
Object row = serDe.deserialize(t);
ObjectInspector rowOI = serDe.getObjectInspector();
System.out.println("Deserialized row: " + row);
// Serialize
Text serialized = (Text) serDe.serialize(row, rowOI);
assertEquals(t, serialized);
// Do some changes (optional)
ObjectInspector standardWritableRowOI = ObjectInspectorUtils.getStandardObjectInspector(rowOI, ObjectInspectorCopyOption.WRITABLE);
Object standardWritableRow = ObjectInspectorUtils.copyToStandardObject(row, rowOI, ObjectInspectorCopyOption.WRITABLE);
// Serialize
serialized = (Text) serDe.serialize(standardWritableRow, standardWritableRowOI);
assertEquals(t, serialized);
} catch (Throwable e) {
e.printStackTrace();
throw e;
}
}
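The createSerDe helper used by this test is not shown above. Below is a plausible sketch, assuming the contrib RegexSerDe (which supports both input.regex for deserialization and output.format.string for serialization) and the standard serdeConstants column properties; the factory class name is hypothetical and the exact initialize signature varies across Hive versions.

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.contrib.serde2.RegexSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeException;

public class RegexSerDeFactory {

  // Hypothetical stand-in for the createSerDe helper referenced by the test.
  static AbstractSerDe createSerDe(String fieldNames, String fieldTypes,
      String inputRegex, String outputFormatString) throws SerDeException {
    Properties schema = new Properties();
    schema.setProperty(serdeConstants.LIST_COLUMNS, fieldNames);
    schema.setProperty(serdeConstants.LIST_COLUMN_TYPES, fieldTypes);
    schema.setProperty("input.regex", inputRegex);
    schema.setProperty("output.format.string", outputFormatString);

    RegexSerDe serde = new RegexSerDe();
    // Three-argument initialize, matching the ExecReducer snippet below.
    serde.initialize(new Configuration(), schema, null);
    return serde;
  }
}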
Use of org.apache.hadoop.hive.serde2.AbstractSerDe in project hive by apache.
Class ExecReducer, method configure.
@Override
public void configure(JobConf job) {
rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
ObjectInspector keyObjectInspector;
Utilities.tryLoggingClassPaths(job, LOG);
jc = job;
ReduceWork gWork = Utilities.getReduceWork(job);
reducer = gWork.getReducer();
// Clear out any parents, as the reducer is the root of the operator tree.
reducer.setParentOperators(null);
isTagged = gWork.getNeedsTagging();
try {
keyTableDesc = gWork.getKeyDesc();
inputKeySerDe = ReflectionUtils.newInstance(keyTableDesc.getSerDeClass(), null);
inputKeySerDe.initialize(null, keyTableDesc.getProperties(), null);
keyObjectInspector = inputKeySerDe.getObjectInspector();
valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];
for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
// We should initialize the SerDe with the TypeInfo when available.
valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
AbstractSerDe valueObjectSerDe = ReflectionUtils.newInstance(valueTableDesc[tag].getSerDeClass(), null);
valueObjectSerDe.initialize(null, valueTableDesc[tag].getProperties(), null);
inputValueDeserializer[tag] = valueObjectSerDe;
valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();
ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
ois.add(keyObjectInspector);
ois.add(valueObjectInspector[tag]);
rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
}
} catch (Exception e) {
throw new RuntimeException(e);
}
MapredContext.init(false, new JobConf(jc));
// initialize reduce operator tree
try {
LOG.info(reducer.dump(0));
reducer.initialize(jc, rowObjectInspector);
} catch (Throwable e) {
abort = true;
if (e instanceof OutOfMemoryError) {
// Don't create a new object if we are already out of memory
throw (OutOfMemoryError) e;
} else {
throw new RuntimeException("Reduce operator initialization failed", e);
}
}
}
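The row object inspector that configure() builds for each tag is a standard struct over a [key, value] pair. The sketch below is a small, self-contained illustration of how such an inspector is constructed and queried; the literal field names stand in for Utilities.reduceFieldNameList and the value types are arbitrary.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;

public class ReduceRowOIExample {
  public static void main(String[] args) {
    // Build a row OI the same way configure() does: one OI for the key, one for the value.
    List<String> fieldNames = Arrays.asList("KEY", "VALUE");  // stand-in for Utilities.reduceFieldNameList
    List<ObjectInspector> ois = new ArrayList<>();
    ois.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector);
    ois.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector);
    StructObjectInspector rowOI =
        ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, ois);

    // A reduce-side row is then a [key, value] pair addressable through the struct OI.
    List<Object> row = Arrays.asList("some-key", 42L);
    StructField keyField = rowOI.getStructFieldRef("KEY");
    System.out.println("key = " + rowOI.getStructFieldData(row, keyField));
  }
}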