Use of org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe in project hive by apache.
The class TestRegexSerDe, method testRegexSerDe.
/**
 * Test the LazySimpleSerDe class.
 */
public void testRegexSerDe() throws Throwable {
  try {
    // Create the SerDe
    AbstractSerDe serDe = createSerDe(
        "host,identity,user,time,request,status,size,referer,agent",
        "string,string,string,string,string,string,string,string,string",
        "([^ ]*) ([^ ]*) ([^ ]*) (-|\\[[^\\]]*\\]) ([^ \"]*|\"[^\"]*\") "
            + "([0-9]*) ([0-9]*) ([^ \"]*|\"[^\"]*\") ([^ \"]*|\"[^\"]*\")",
        "%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s");
    // Data
    Text t = new Text("127.0.0.1 - - [26/May/2009:00:00:00 +0000] "
        + "\"GET /someurl/?track=Blabla(Main) HTTP/1.1\" 200 5864 - "
        + "\"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) "
        + "AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.65 Safari/525.19\"");
    // Deserialize
    Object row = serDe.deserialize(t);
    ObjectInspector rowOI = serDe.getObjectInspector();
    System.out.println("Deserialized row: " + row);
    // Serialize
    Text serialized = (Text) serDe.serialize(row, rowOI);
    assertEquals(t, serialized);
    // Copy to standard (writable) objects and serialize again
    ObjectInspector standardWritableRowOI = ObjectInspectorUtils.getStandardObjectInspector(rowOI, ObjectInspectorCopyOption.WRITABLE);
    Object standardWritableRow = ObjectInspectorUtils.copyToStandardObject(row, rowOI, ObjectInspectorCopyOption.WRITABLE);
    serialized = (Text) serDe.serialize(standardWritableRow, standardWritableRowOI);
    assertEquals(t, serialized);
  } catch (Throwable e) {
    e.printStackTrace();
    throw e;
  }
}
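The createSerDe helper used above is not part of this snippet. Below is a minimal sketch of what such a helper might look like, assuming it forwards its four arguments into the table properties understood by the contrib RegexSerDe ("input.regex" and "output.format.string"); the property wiring and the use of SerDeUtils.initializeSerDe are assumptions, not the test's actual code.

// Hypothetical sketch of createSerDe(...); the real test helper may differ.
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.contrib.serde2.RegexSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.AbstractSerDe;
import org.apache.hadoop.hive.serde2.SerDeUtils;

private AbstractSerDe createSerDe(String fieldNames, String fieldTypes,
    String inputRegex, String outputFormatString) throws Throwable {
  Properties schema = new Properties();
  // Column names and types, comma-separated, as the SerDe expects them.
  schema.setProperty(serdeConstants.LIST_COLUMNS, fieldNames);
  schema.setProperty(serdeConstants.LIST_COLUMN_TYPES, fieldTypes);
  // Regex that splits each input line into one capture group per column.
  schema.setProperty("input.regex", inputRegex);
  // Format string used to reassemble a row on serialization.
  schema.setProperty("output.format.string", outputFormatString);
  RegexSerDe serDe = new RegexSerDe();
  SerDeUtils.initializeSerDe(serDe, new Configuration(), schema, null);
  return serDe;
}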
Use of org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe in project hive by apache.
The class DDLTask, method alterTableOrSinglePartition.
private List<Task<?>> alterTableOrSinglePartition(AlterTableDesc alterTbl, Table tbl, Partition part) throws HiveException {
  EnvironmentContext environmentContext = alterTbl.getEnvironmentContext();
  if (environmentContext == null) {
    environmentContext = new EnvironmentContext();
    alterTbl.setEnvironmentContext(environmentContext);
  }
  // no need to update stats in alter table/partition operations
  if (environmentContext.getProperties() == null || environmentContext.getProperties().get(StatsSetupConst.DO_NOT_UPDATE_STATS) == null) {
    environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
  }
  if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.RENAME) {
    tbl.setDbName(Utilities.getDatabaseName(alterTbl.getNewName()));
    tbl.setTableName(Utilities.getTableName(alterTbl.getNewName()));
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDCOLS) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    String serializationLib = sd.getSerdeInfo().getSerializationLib();
    AvroSerdeUtils.handleAlterTableForAvro(conf, serializationLib, tbl.getTTable().getParameters());
    List<FieldSchema> oldCols = (part == null ? tbl.getColsForMetastore() : part.getColsForMetastore());
    List<FieldSchema> newCols = alterTbl.getNewCols();
    if (serializationLib.equals("org.apache.hadoop.hive.serde.thrift.columnsetSerDe")) {
      console.printInfo("Replacing columns for columnsetSerDe and changing to LazySimpleSerDe");
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());
      sd.setCols(newCols);
    } else {
      // make sure the columns do not already exist
      Iterator<FieldSchema> iterNewCols = newCols.iterator();
      while (iterNewCols.hasNext()) {
        FieldSchema newCol = iterNewCols.next();
        String newColName = newCol.getName();
        Iterator<FieldSchema> iterOldCols = oldCols.iterator();
        while (iterOldCols.hasNext()) {
          String oldColName = iterOldCols.next().getName();
          if (oldColName.equalsIgnoreCase(newColName)) {
            throw new HiveException(ErrorMsg.DUPLICATE_COLUMN_NAMES, newColName);
          }
        }
        oldCols.add(newCol);
      }
      sd.setCols(oldCols);
    }
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.RENAMECOLUMN) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    String serializationLib = sd.getSerdeInfo().getSerializationLib();
    AvroSerdeUtils.handleAlterTableForAvro(conf, serializationLib, tbl.getTTable().getParameters());
    List<FieldSchema> oldCols = (part == null ? tbl.getColsForMetastore() : part.getColsForMetastore());
    List<FieldSchema> newCols = new ArrayList<FieldSchema>();
    Iterator<FieldSchema> iterOldCols = oldCols.iterator();
    String oldName = alterTbl.getOldColName();
    String newName = alterTbl.getNewColName();
    String type = alterTbl.getNewColType();
    String comment = alterTbl.getNewColComment();
    boolean first = alterTbl.getFirst();
    String afterCol = alterTbl.getAfterCol();
    // if orc table, restrict reordering columns as it will break schema evolution
    boolean isOrcSchemaEvolution = sd.getInputFormat().equals(OrcInputFormat.class.getName()) && isSchemaEvolutionEnabled(tbl);
    if (isOrcSchemaEvolution && (first || (afterCol != null && !afterCol.trim().isEmpty()))) {
      throw new HiveException(ErrorMsg.CANNOT_REORDER_COLUMNS, alterTbl.getOldName());
    }
    FieldSchema column = null;
    boolean found = false;
    int position = -1;
    if (first) {
      position = 0;
    }
    int i = 1;
    while (iterOldCols.hasNext()) {
      FieldSchema col = iterOldCols.next();
      String oldColName = col.getName();
      if (oldColName.equalsIgnoreCase(newName) && !oldColName.equalsIgnoreCase(oldName)) {
        throw new HiveException(ErrorMsg.DUPLICATE_COLUMN_NAMES, newName);
      } else if (oldColName.equalsIgnoreCase(oldName)) {
        col.setName(newName);
        if (type != null && !type.trim().equals("")) {
          col.setType(type);
        }
        if (comment != null) {
          col.setComment(comment);
        }
        found = true;
        if (first || (afterCol != null && !afterCol.trim().equals(""))) {
          column = col;
          continue;
        }
      }
      if (afterCol != null && !afterCol.trim().equals("") && oldColName.equalsIgnoreCase(afterCol)) {
        position = i;
      }
      i++;
      newCols.add(col);
    }
    // did not find the column
    if (!found) {
      throw new HiveException(ErrorMsg.INVALID_COLUMN, oldName);
    }
    // after column is not null, but we did not find it.
    if ((afterCol != null && !afterCol.trim().equals("")) && position < 0) {
      throw new HiveException(ErrorMsg.INVALID_COLUMN, afterCol);
    }
    if (position >= 0) {
      newCols.add(position, column);
    }
    sd.setCols(newCols);
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.REPLACECOLS) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    // change SerDe to LazySimpleSerDe if it is columnsetSerDe
    String serializationLib = sd.getSerdeInfo().getSerializationLib();
    if (serializationLib.equals("org.apache.hadoop.hive.serde.thrift.columnsetSerDe")) {
      console.printInfo("Replacing columns for columnsetSerDe and changing to LazySimpleSerDe");
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());
    } else if (!serializationLib.equals(MetadataTypedColumnsetSerDe.class.getName())
        && !serializationLib.equals(LazySimpleSerDe.class.getName())
        && !serializationLib.equals(ColumnarSerDe.class.getName())
        && !serializationLib.equals(DynamicSerDe.class.getName())
        && !serializationLib.equals(ParquetHiveSerDe.class.getName())
        && !serializationLib.equals(OrcSerde.class.getName())) {
      throw new HiveException(ErrorMsg.CANNOT_REPLACE_COLUMNS, alterTbl.getOldName());
    }
    final boolean isOrcSchemaEvolution = serializationLib.equals(OrcSerde.class.getName()) && isSchemaEvolutionEnabled(tbl);
    // adding columns and limited integer type promotion are supported for ORC schema evolution
    if (isOrcSchemaEvolution) {
      final List<FieldSchema> existingCols = sd.getCols();
      final List<FieldSchema> replaceCols = alterTbl.getNewCols();
      if (replaceCols.size() < existingCols.size()) {
        throw new HiveException(ErrorMsg.REPLACE_CANNOT_DROP_COLUMNS, alterTbl.getOldName());
      }
    }
    boolean partitioned = tbl.isPartitioned();
    boolean droppingColumns = alterTbl.getNewCols().size() < sd.getCols().size();
    if (ParquetHiveSerDe.isParquetTable(tbl) && isSchemaEvolutionEnabled(tbl)
        && !alterTbl.getIsCascade() && droppingColumns && partitioned) {
      LOG.warn("Cannot drop columns from a partitioned parquet table without the CASCADE option");
      throw new HiveException(ErrorMsg.REPLACE_CANNOT_DROP_COLUMNS, alterTbl.getOldName());
    }
    sd.setCols(alterTbl.getNewCols());
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDPROPS) {
    return alterTableAddProps(alterTbl, tbl, part, environmentContext);
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.DROPPROPS) {
    return alterTableDropProps(alterTbl, tbl, part, environmentContext);
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSERDEPROPS) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    sd.getSerdeInfo().getParameters().putAll(alterTbl.getProps());
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSERDE) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    String serdeName = alterTbl.getSerdeName();
    String oldSerdeName = sd.getSerdeInfo().getSerializationLib();
    // if orc table, restrict changing the serde as it can break schema evolution
    if (isSchemaEvolutionEnabled(tbl) && oldSerdeName.equalsIgnoreCase(OrcSerde.class.getName())
        && !serdeName.equalsIgnoreCase(OrcSerde.class.getName())) {
      throw new HiveException(ErrorMsg.CANNOT_CHANGE_SERDE, OrcSerde.class.getSimpleName(), alterTbl.getOldName());
    }
    sd.getSerdeInfo().setSerializationLib(serdeName);
    if ((alterTbl.getProps() != null) && (alterTbl.getProps().size() > 0)) {
      sd.getSerdeInfo().getParameters().putAll(alterTbl.getProps());
    }
    if (part != null) {
      // TODO: wtf? This doesn't do anything.
      part.getTPartition().getSd().setCols(part.getTPartition().getSd().getCols());
    } else {
      if (Table.shouldStoreFieldsInMetastore(conf, serdeName, tbl.getParameters())
          && !Table.hasMetastoreBasedSchema(conf, oldSerdeName)) {
        // If the new SerDe needs to store fields in the metastore but the old SerDe doesn't, save
        // the fields now so that the new SerDe can operate. Note that this may fail if some fields
        // from old SerDe are too long to be stored in metastore, but there's nothing we can do.
        try {
          Deserializer oldSerde = HiveMetaStoreUtils.getDeserializer(conf, tbl.getTTable(), false, oldSerdeName);
          tbl.setFields(Hive.getFieldsFromDeserializer(tbl.getTableName(), oldSerde));
        } catch (MetaException ex) {
          throw new HiveException(ex);
        }
      }
    }
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDFILEFORMAT) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    // if orc table, restrict changing the file format as it can break schema evolution
    if (isSchemaEvolutionEnabled(tbl) && sd.getInputFormat().equals(OrcInputFormat.class.getName())
        && !alterTbl.getInputFormat().equals(OrcInputFormat.class.getName())) {
      throw new HiveException(ErrorMsg.CANNOT_CHANGE_FILEFORMAT, "ORC", alterTbl.getOldName());
    }
    sd.setInputFormat(alterTbl.getInputFormat());
    sd.setOutputFormat(alterTbl.getOutputFormat());
    if (alterTbl.getSerdeName() != null) {
      sd.getSerdeInfo().setSerializationLib(alterTbl.getSerdeName());
    }
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDCLUSTERSORTCOLUMN) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    // validate sort columns and bucket columns
    List<String> columns = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
    if (!alterTbl.isTurnOffSorting()) {
      Utilities.validateColumnNames(columns, alterTbl.getBucketColumns());
    }
    if (alterTbl.getSortColumns() != null) {
      Utilities.validateColumnNames(columns, Utilities.getColumnNamesFromSortCols(alterTbl.getSortColumns()));
    }
    if (alterTbl.isTurnOffSorting()) {
      sd.setSortCols(new ArrayList<Order>());
    } else if (alterTbl.getNumberBuckets() == -1) {
      // -1 buckets means to turn off bucketing
      sd.setBucketCols(new ArrayList<String>());
      sd.setNumBuckets(-1);
      sd.setSortCols(new ArrayList<Order>());
    } else {
      sd.setBucketCols(alterTbl.getBucketColumns());
      sd.setNumBuckets(alterTbl.getNumberBuckets());
      sd.setSortCols(alterTbl.getSortColumns());
    }
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ALTERLOCATION) {
    StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
    String newLocation = alterTbl.getNewLocation();
    try {
      URI locUri = new URI(newLocation);
      if (!new Path(locUri).isAbsolute()) {
        throw new HiveException(ErrorMsg.BAD_LOCATION_VALUE, newLocation);
      }
      sd.setLocation(newLocation);
    } catch (URISyntaxException e) {
      throw new HiveException(e);
    }
    environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSKEWEDBY) {
    // Validation has been done at compile time; no validation is needed here.
    List<String> skewedColNames = null;
    List<List<String>> skewedValues = null;
    if (alterTbl.isTurnOffSkewed()) {
      // Convert skewed table to non-skewed table.
      skewedColNames = new ArrayList<String>();
      skewedValues = new ArrayList<List<String>>();
    } else {
      skewedColNames = alterTbl.getSkewedColNames();
      skewedValues = alterTbl.getSkewedColValues();
    }
    if (null == tbl.getSkewedInfo()) {
      // Convert non-skewed table to skewed table.
      SkewedInfo skewedInfo = new SkewedInfo();
      skewedInfo.setSkewedColNames(skewedColNames);
      skewedInfo.setSkewedColValues(skewedValues);
      tbl.setSkewedInfo(skewedInfo);
    } else {
      tbl.setSkewedColNames(skewedColNames);
      tbl.setSkewedColValues(skewedValues);
    }
    tbl.setStoredAsSubDirectories(alterTbl.isStoredAsSubDirectories());
  } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ALTERSKEWEDLOCATION) {
    // process location one-by-one
    Map<List<String>, String> locMaps = alterTbl.getSkewedLocations();
    Set<List<String>> keys = locMaps.keySet();
    for (List<String> key : keys) {
      String newLocation = locMaps.get(key);
      try {
        URI locUri = new URI(newLocation);
        if (part != null) {
          List<String> slk = new ArrayList<String>(key);
          part.setSkewedValueLocationMap(slk, locUri.toString());
        } else {
          List<String> slk = new ArrayList<String>(key);
          tbl.setSkewedValueLocationMap(slk, locUri.toString());
        }
      } catch (URISyntaxException e) {
        throw new HiveException(e);
      }
    }
    environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
  } else if (alterTbl.getOp() == AlterTableTypes.ALTERBUCKETNUM) {
    if (part != null) {
      if (part.getBucketCount() == alterTbl.getNumberBuckets()) {
        return null;
      }
      part.setBucketCount(alterTbl.getNumberBuckets());
    } else {
      if (tbl.getNumBuckets() == alterTbl.getNumberBuckets()) {
        return null;
      }
      tbl.setNumBuckets(alterTbl.getNumberBuckets());
    }
  } else {
    throw new HiveException(ErrorMsg.UNSUPPORTED_ALTER_TBL_OP, alterTbl.getOp().toString());
  }
  return null;
}
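The ADDCOLS branch above is the piece most directly tied to LazySimpleSerDe: tables still using the legacy columnsetSerDe are migrated to LazySimpleSerDe before the new column list is applied, and for every other supported SerDe the new columns are merged in after a case-insensitive duplicate check. The following is a condensed, hypothetical restatement of just that branch; the class and method names are illustrative, and IllegalArgumentException stands in for the HiveException/ErrorMsg pair used in the real code.

// Illustrative sketch only; mirrors the ADDCOLS checks above, not Hive's actual helper.
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;

final class AddColumnsSketch {
  static void addColumns(StorageDescriptor sd, List<FieldSchema> oldCols, List<FieldSchema> newCols) {
    String lib = sd.getSerdeInfo().getSerializationLib();
    if ("org.apache.hadoop.hive.serde.thrift.columnsetSerDe".equals(lib)) {
      // The legacy columnset SerDe cannot describe individual columns, so the table
      // is switched to LazySimpleSerDe and the new column list replaces the old one.
      sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());
      sd.setCols(newCols);
      return;
    }
    List<FieldSchema> merged = new ArrayList<FieldSchema>(oldCols);
    for (FieldSchema newCol : newCols) {
      for (FieldSchema oldCol : merged) {
        if (oldCol.getName().equalsIgnoreCase(newCol.getName())) {
          // Corresponds to ErrorMsg.DUPLICATE_COLUMN_NAMES in the original method.
          throw new IllegalArgumentException("Duplicate column name: " + newCol.getName());
        }
      }
      merged.add(newCol);
    }
    sd.setCols(merged);
  }
}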
Use of org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe in project hive by apache.
The class VectorDeserializeOrcWriter, method create.
// TODO: if more writers are added, separate out an EncodingWriterFactory
public static EncodingWriter create(InputFormat<?, ?> sourceIf, Deserializer serDe,
    Map<Path, PartitionDesc> parts, Configuration daemonConf, Configuration jobConf,
    Path splitPath, StructObjectInspector sourceOi, List<Integer> sourceIncludes,
    boolean[] cacheIncludes, int allocSize) throws IOException {
  // Vector SerDe can be disabled both on client and server side.
  if (!HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
      || !HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
      || !(sourceIf instanceof TextInputFormat) || !(serDe instanceof LazySimpleSerDe)) {
    return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
  }
  Path path = splitPath.getFileSystem(daemonConf).makeQualified(splitPath);
  PartitionDesc partDesc = HiveFileFormatUtils.getFromPathRecursively(parts, path, null);
  if (partDesc == null) {
    LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: no partition desc for " + path);
    return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
  }
  Properties tblProps = partDesc.getTableDesc().getProperties();
  if ("true".equalsIgnoreCase(tblProps.getProperty(serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST))) {
    LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter due to " + serdeConstants.SERIALIZATION_LAST_COLUMN_TAKES_REST);
    return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
  }
  for (StructField sf : sourceOi.getAllStructFieldRefs()) {
    Category c = sf.getFieldObjectInspector().getCategory();
    if (c != Category.PRIMITIVE) {
      LlapIoImpl.LOG.info("Not using VertorDeserializeOrcWriter: " + c + " is not supported");
      return new DeserializerOrcWriter(serDe, sourceOi, allocSize);
    }
  }
  LlapIoImpl.LOG.info("Creating VertorDeserializeOrcWriter for " + path);
  return new VectorDeserializeOrcWriter(daemonConf, tblProps, sourceOi, sourceIncludes, cacheIncludes, allocSize);
}
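The gating logic above can be read as a single eligibility predicate: LLAP only takes the vectorized encode path for TextInputFormat splits deserialized by LazySimpleSerDe, with the encode flag enabled in both the daemon and the job configuration, and only when every top-level column is primitive. Below is a hedged sketch of that predicate; it deliberately omits the partition-descriptor and serialization.last.column.takes.rest checks shown above, and the method name is illustrative rather than a Hive API.

// Sketch of the eligibility check; not part of Hive, names are illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.serde2.Deserializer;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

static boolean canUseVectorSerDePath(InputFormat<?, ?> sourceIf, Deserializer serDe,
    Configuration daemonConf, Configuration jobConf, StructObjectInspector sourceOi) {
  if (!HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)
      || !HiveConf.getBoolVar(jobConf, ConfVars.LLAP_IO_ENCODE_VECTOR_SERDE_ENABLED)) {
    return false; // disabled on either the daemon or the client side
  }
  if (!(sourceIf instanceof TextInputFormat) || !(serDe instanceof LazySimpleSerDe)) {
    return false; // only the text + LazySimpleSerDe combination is vectorized
  }
  for (StructField sf : sourceOi.getAllStructFieldRefs()) {
    if (sf.getFieldObjectInspector().getCategory() != Category.PRIMITIVE) {
      return false; // complex column types fall back to the row-by-row writer
    }
  }
  return true;
}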
Use of org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe in project hive by apache.
The class TestSimpleMapEqualComparer, method testIncompatibleType.
public void testIncompatibleType() throws SerDeException, IOException {
  // empty maps
  StringTextMapHolder o1 = new StringTextMapHolder();
  StructObjectInspector oi1 = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(StringTextMapHolder.class, ObjectInspectorOptions.JAVA);
  LazySimpleSerDe serde = new LazySimpleSerDe();
  Configuration conf = new Configuration();
  Properties tbl = new Properties();
  tbl.setProperty(serdeConstants.LIST_COLUMNS, ObjectInspectorUtils.getFieldNames(oi1));
  tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, ObjectInspectorUtils.getFieldTypes(oi1));
  LazySerDeParameters serdeParams = new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName());
  SerDeUtils.initializeSerDe(serde, conf, tbl, null);
  ObjectInspector oi2 = serde.getObjectInspector();
  Object o2 = serializeAndDeserialize(o1, oi1, serde, serdeParams);
  int rc = ObjectInspectorUtils.compare(o1, oi1, o2, oi2, new SimpleMapEqualComparer());
  assertEquals(0, rc);
  // logically equal maps; the comparer still reports a difference because the two
  // representations have incompatible key types (hence the test name)
  o1.mMap.put("42", new Text("The answer to Life, Universe And Everything"));
  o1.mMap.put("1729", new Text("A taxi cab number"));
  o2 = serializeAndDeserialize(o1, oi1, serde, serdeParams);
  rc = ObjectInspectorUtils.compare(o1, oi1, o2, oi2, new SimpleMapEqualComparer());
  assertFalse(0 == rc);
}
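The serializeAndDeserialize helper called in this test (and in the next one) is not part of the snippet. A minimal sketch of a round trip that would fit these call sites is shown below, assuming the SerDe's own serialize/deserialize pair is sufficient; the real test helper may instead write through ByteStream.Output and the LazySerDeParameters directly.

// Hypothetical round-trip helper matching the call sites above; not the test's actual code.
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.io.Writable;

private Object serializeAndDeserialize(Object row, StructObjectInspector rowOi,
    LazySimpleSerDe serde, LazySerDeParameters serdeParams) throws SerDeException {
  // serdeParams is accepted only to mirror the call sites; this simplified sketch
  // relies on the SerDe instance itself rather than the low-level lazy serialization.
  Writable serialized = serde.serialize(row, rowOi);
  return serde.deserialize(serialized);
}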
Use of org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe in project hive by apache.
The class TestCrossMapEqualComparer, method testCompatibleType.
public void testCompatibleType() throws SerDeException, IOException {
  // empty maps
  TextStringMapHolder o1 = new TextStringMapHolder();
  StructObjectInspector oi1 = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(TextStringMapHolder.class, ObjectInspectorOptions.JAVA);
  LazySimpleSerDe serde = new LazySimpleSerDe();
  Configuration conf = new Configuration();
  Properties tbl = new Properties();
  tbl.setProperty(serdeConstants.LIST_COLUMNS, ObjectInspectorUtils.getFieldNames(oi1));
  tbl.setProperty(serdeConstants.LIST_COLUMN_TYPES, ObjectInspectorUtils.getFieldTypes(oi1));
  LazySerDeParameters serdeParams = new LazySerDeParameters(conf, tbl, LazySimpleSerDe.class.getName());
  SerDeUtils.initializeSerDe(serde, conf, tbl, null);
  ObjectInspector oi2 = serde.getObjectInspector();
  Object o2 = serializeAndDeserialize(o1, oi1, serde, serdeParams);
  int rc = ObjectInspectorUtils.compare(o1, oi1, o2, oi2, new CrossMapEqualComparer());
  assertEquals(0, rc);
  // equal maps
  o1.mMap.put(new Text("42"), "The answer to Life, Universe And Everything");
  o1.mMap.put(new Text("1729"), "A taxi cab number");
  o2 = serializeAndDeserialize(o1, oi1, serde, serdeParams);
  rc = ObjectInspectorUtils.compare(o1, oi1, o2, oi2, new CrossMapEqualComparer());
  assertEquals(0, rc);
  // unequal maps
  o1.mMap.put(new Text("1729"), "Hardy-Ramanujan Number");
  rc = ObjectInspectorUtils.compare(o1, oi1, o2, oi2, new CrossMapEqualComparer());
  assertFalse(0 == rc);
}
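Both map-comparer tests rely on small holder fixtures (StringTextMapHolder, TextStringMapHolder) that are not shown here. A hypothetical sketch of their shape is below; the real fixtures may use a different map implementation or visibility, but the reflection ObjectInspector only needs a map field named mMap.

// Hypothetical test fixtures; the actual holder classes in Hive's tests may differ.
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.io.Text;

class StringTextMapHolder {
  Map<String, Text> mMap = new LinkedHashMap<String, Text>();
}

class TextStringMapHolder {
  Map<Text, String> mMap = new LinkedHashMap<Text, String>();
}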