
Example 91 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestOrcSplitElimination method testSplitEliminationLargeMaxSplit.

@Test
public void testSplitEliminationLargeMaxSplit() throws Exception {
    ObjectInspector inspector = createIO();
    Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector, 100000, CompressionKind.NONE, 10000, 10000);
    writeData(writer);
    writer.close();
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, 1000);
    HiveConf.setLongVar(conf, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, 150000);
    InputFormat<?, ?> in = new OrcInputFormat();
    FileInputFormat.setInputPaths(conf, testFilePath.toString());
    GenericUDF udf = new GenericUDFOPEqualOrLessThan();
    List<ExprNodeDesc> childExpr = Lists.newArrayList();
    ExprNodeConstantDesc con;
    ExprNodeGenericFuncDesc en;
    String sargStr;
    createTestSarg(inspector, udf, childExpr);
    InputSplit[] splits = in.getSplits(conf, 1);
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(0);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // no stripe satisfies the condition
    assertEquals(0, splits.length);
    con = new ExprNodeConstantDesc(2);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // only the first stripe satisfies the condition, hence a single split
    assertEquals(1, splits.length);
    con = new ExprNodeConstantDesc(5);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first stripe satisfies the predicate and forms its own split; the last stripe is a
    // separate split
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(13);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first 2 stripes satisfy the predicate and are merged into a single split; the last
    // stripe is a separate split
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(29);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first 3 stripes satisfy the predicate and are merged into a single split; the last
    // stripe is a separate split
    assertEquals(2, splits.length);
    con = new ExprNodeConstantDesc(70);
    childExpr.set(1, con);
    en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    sargStr = SerializationUtilities.serializeExpression(en);
    conf.set("hive.io.filter.expr.serialized", sargStr);
    splits = in.getSplits(conf, 1);
    // the first 2 stripes satisfy the predicate and are merged into a single split; the last
    // two stripes form a separate split
    assertEquals(2, splits.length);
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) ExprNodeConstantDesc(org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc) GenericUDFOPEqualOrLessThan(org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan) ExprNodeGenericFuncDesc(org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc) GenericUDF(org.apache.hadoop.hive.ql.udf.generic.GenericUDF) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) InputSplit(org.apache.hadoop.mapred.InputSplit) Test(org.junit.Test)
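
The helpers createIO, writeData, and createTestSarg belong to TestOrcSplitElimination and are not shown on this page. As a rough, hypothetical sketch of what createTestSarg does (the column name "userid", table alias "T", placeholder constant, and the use of ExprNodeColumnDesc from org.apache.hadoop.hive.ql.plan are assumptions, not the actual test body), it builds the two operands of the <= predicate and pushes the serialized expression into the test's JobConf:

private void createTestSarg(ObjectInspector inspector, GenericUDF udf, List<ExprNodeDesc> childExpr) {
    // left operand: the column the predicate filters on (name assumed here)
    childExpr.add(new ExprNodeColumnDesc(Long.class, "userid", "T", false));
    // right operand: an initial constant; the test later overwrites index 1
    childExpr.add(new ExprNodeConstantDesc(100));
    ExprNodeGenericFuncDesc en = new ExprNodeGenericFuncDesc(inspector, udf, childExpr);
    String sargStr = SerializationUtilities.serializeExpression(en);
    // the ORC reader picks the SARG up from this property during getSplits()
    conf.set("hive.io.filter.expr.serialized", sargStr);
}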

Example 92 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class TestHiveAccumuloTypes method testUtf8Types.

@Test
public void testUtf8Types() throws Exception {
    final String tableName = test.getMethodName(), user = "root", pass = "";
    MockInstance mockInstance = new MockInstance(test.getMethodName());
    Connector conn = mockInstance.getConnector(user, new PasswordToken(pass));
    HiveAccumuloTableInputFormat inputformat = new HiveAccumuloTableInputFormat();
    JobConf conf = new JobConf();
    conf.set(AccumuloSerDeParameters.TABLE_NAME, tableName);
    conf.set(AccumuloSerDeParameters.USE_MOCK_INSTANCE, "true");
    conf.set(AccumuloSerDeParameters.INSTANCE_NAME, test.getMethodName());
    conf.set(AccumuloSerDeParameters.USER_NAME, user);
    conf.set(AccumuloSerDeParameters.USER_PASS, pass);
    // not used for mock, but required by the input format
    conf.set(AccumuloSerDeParameters.ZOOKEEPERS, "localhost:2181");
    conf.set(AccumuloSerDeParameters.COLUMN_MAPPINGS, AccumuloHiveConstants.ROWID + ",cf:string,cf:boolean,cf:tinyint,cf:smallint,cf:int,cf:bigint" + ",cf:float,cf:double,cf:decimal,cf:date,cf:timestamp,cf:char,cf:varchar");
    conf.set(serdeConstants.LIST_COLUMNS, "string,string,boolean,tinyint,smallint,int,bigint,float,double,decimal,date,timestamp,char(4),varchar(7)");
    conf.set(serdeConstants.LIST_COLUMN_TYPES, "string,string,boolean,tinyint,smallint,int,bigint,float,double,decimal,date,timestamp,char(4),varchar(7)");
    conn.tableOperations().create(tableName);
    BatchWriterConfig writerConf = new BatchWriterConfig();
    BatchWriter writer = conn.createBatchWriter(tableName, writerConf);
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    String cf = "cf";
    byte[] cfBytes = cf.getBytes();
    ByteArrayRef byteRef = new ByteArrayRef();
    Mutation m = new Mutation("row1");
    // string
    String stringValue = "string";
    baos.reset();
    JavaStringObjectInspector stringOI = (JavaStringObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.STRING_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, stringOI.create(stringValue), stringOI, false, (byte) 0, null);
    m.put(cfBytes, "string".getBytes(), baos.toByteArray());
    // boolean
    boolean booleanValue = true;
    baos.reset();
    JavaBooleanObjectInspector booleanOI = (JavaBooleanObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.BOOLEAN_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, booleanOI.create(booleanValue), booleanOI, false, (byte) 0, null);
    m.put(cfBytes, "boolean".getBytes(), baos.toByteArray());
    // tinyint
    byte tinyintValue = -127;
    baos.reset();
    JavaByteObjectInspector byteOI = (JavaByteObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.TINYINT_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, tinyintValue, byteOI, false, (byte) 0, null);
    m.put(cfBytes, "tinyint".getBytes(), baos.toByteArray());
    // smallint
    short smallintValue = Short.MAX_VALUE;
    baos.reset();
    JavaShortObjectInspector shortOI = (JavaShortObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.SMALLINT_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, smallintValue, shortOI, false, (byte) 0, null);
    m.put(cfBytes, "smallint".getBytes(), baos.toByteArray());
    // int
    int intValue = Integer.MAX_VALUE;
    baos.reset();
    JavaIntObjectInspector intOI = (JavaIntObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.INT_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, intValue, intOI, false, (byte) 0, null);
    m.put(cfBytes, "int".getBytes(), baos.toByteArray());
    // bigint
    long bigintValue = Long.MAX_VALUE;
    baos.reset();
    JavaLongObjectInspector longOI = (JavaLongObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.BIGINT_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, bigintValue, longOI, false, (byte) 0, null);
    m.put(cfBytes, "bigint".getBytes(), baos.toByteArray());
    // float
    float floatValue = Float.MAX_VALUE;
    baos.reset();
    JavaFloatObjectInspector floatOI = (JavaFloatObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.FLOAT_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, floatValue, floatOI, false, (byte) 0, null);
    m.put(cfBytes, "float".getBytes(), baos.toByteArray());
    // double
    double doubleValue = Double.MAX_VALUE;
    baos.reset();
    JavaDoubleObjectInspector doubleOI = (JavaDoubleObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.DOUBLE_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, doubleValue, doubleOI, false, (byte) 0, null);
    m.put(cfBytes, "double".getBytes(), baos.toByteArray());
    // decimal
    HiveDecimal decimalValue = HiveDecimal.create("1.23");
    baos.reset();
    JavaHiveDecimalObjectInspector decimalOI = (JavaHiveDecimalObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(new DecimalTypeInfo(5, 2));
    LazyUtils.writePrimitiveUTF8(baos, decimalOI.create(decimalValue), decimalOI, false, (byte) 0, null);
    m.put(cfBytes, "decimal".getBytes(), baos.toByteArray());
    // date
    Date now = new Date(System.currentTimeMillis());
    DateWritable dateWritable = new DateWritable(now);
    Date dateValue = dateWritable.get();
    baos.reset();
    JavaDateObjectInspector dateOI = (JavaDateObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.DATE_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, dateOI.create(dateValue), dateOI, false, (byte) 0, null);
    m.put(cfBytes, "date".getBytes(), baos.toByteArray());
    // timestamp
    Timestamp timestampValue = new Timestamp(now.getTime());
    baos.reset();
    JavaTimestampObjectInspector timestampOI = (JavaTimestampObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.TIMESTAMP_TYPE_NAME));
    LazyUtils.writePrimitiveUTF8(baos, timestampOI.create(timestampValue), timestampOI, false, (byte) 0, null);
    m.put(cfBytes, "timestamp".getBytes(), baos.toByteArray());
    // char
    baos.reset();
    HiveChar charValue = new HiveChar("char", 4);
    JavaHiveCharObjectInspector charOI = (JavaHiveCharObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(new CharTypeInfo(4));
    LazyUtils.writePrimitiveUTF8(baos, charOI.create(charValue), charOI, false, (byte) 0, null);
    m.put(cfBytes, "char".getBytes(), baos.toByteArray());
    // varchar
    baos.reset();
    HiveVarchar varcharValue = new HiveVarchar("varchar", 7);
    JavaHiveVarcharObjectInspector varcharOI = (JavaHiveVarcharObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(new VarcharTypeInfo(7));
    LazyUtils.writePrimitiveUTF8(baos, varcharOI.create(varcharValue), varcharOI, false, (byte) 0, null);
    m.put(cfBytes, "varchar".getBytes(), baos.toByteArray());
    writer.addMutation(m);
    writer.close();
    for (Entry<Key, Value> e : conn.createScanner(tableName, new Authorizations())) {
        System.out.println(e);
    }
    // Create the RecordReader
    FileInputFormat.addInputPath(conf, new Path("unused"));
    InputSplit[] splits = inputformat.getSplits(conf, 0);
    assertEquals(1, splits.length);
    RecordReader<Text, AccumuloHiveRow> reader = inputformat.getRecordReader(splits[0], conf, null);
    Text key = reader.createKey();
    AccumuloHiveRow value = reader.createValue();
    reader.next(key, value);
    Assert.assertEquals(13, value.getTuples().size());
    // string
    Text cfText = new Text(cf), cqHolder = new Text();
    cqHolder.set("string");
    byte[] valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyStringObjectInspector lazyStringOI = LazyPrimitiveObjectInspectorFactory.getLazyStringObjectInspector(false, (byte) 0);
    LazyString lazyString = (LazyString) LazyFactory.createLazyObject(lazyStringOI);
    lazyString.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(new Text(stringValue), lazyString.getWritableObject());
    // boolean
    cqHolder.set("boolean");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyBooleanObjectInspector lazyBooleanOI = (LazyBooleanObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.BOOLEAN_TYPE_NAME));
    LazyBoolean lazyBoolean = (LazyBoolean) LazyFactory.createLazyObject(lazyBooleanOI);
    lazyBoolean.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(booleanValue, lazyBoolean.getWritableObject().get());
    // tinyint
    cqHolder.set("tinyint");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyByteObjectInspector lazyByteOI = (LazyByteObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.TINYINT_TYPE_NAME));
    LazyByte lazyByte = (LazyByte) LazyFactory.createLazyObject(lazyByteOI);
    lazyByte.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(tinyintValue, lazyByte.getWritableObject().get());
    // smallint
    cqHolder.set("smallint");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyShortObjectInspector lazyShortOI = (LazyShortObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.SMALLINT_TYPE_NAME));
    LazyShort lazyShort = (LazyShort) LazyFactory.createLazyObject(lazyShortOI);
    lazyShort.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(smallintValue, lazyShort.getWritableObject().get());
    // int
    cqHolder.set("int");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyIntObjectInspector lazyIntOI = (LazyIntObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.INT_TYPE_NAME));
    LazyInteger lazyInt = (LazyInteger) LazyFactory.createLazyObject(lazyIntOI);
    lazyInt.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(intValue, lazyInt.getWritableObject().get());
    // bigint
    cqHolder.set("bigint");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyLongObjectInspector lazyLongOI = (LazyLongObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.BIGINT_TYPE_NAME));
    LazyLong lazyLong = (LazyLong) LazyFactory.createLazyObject(lazyLongOI);
    lazyLong.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(bigintValue, lazyLong.getWritableObject().get());
    // float
    cqHolder.set("float");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyFloatObjectInspector lazyFloatOI = (LazyFloatObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.FLOAT_TYPE_NAME));
    LazyFloat lazyFloat = (LazyFloat) LazyFactory.createLazyObject(lazyFloatOI);
    lazyFloat.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(floatValue, lazyFloat.getWritableObject().get(), 0);
    // double
    cqHolder.set("double");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyDoubleObjectInspector lazyDoubleOI = (LazyDoubleObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.DOUBLE_TYPE_NAME));
    LazyDouble lazyDouble = (LazyDouble) LazyFactory.createLazyObject(lazyDoubleOI);
    lazyDouble.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(doubleValue, lazyDouble.getWritableObject().get(), 0);
    // decimal
    cqHolder.set("decimal");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyHiveDecimalObjectInspector lazyDecimalOI = (LazyHiveDecimalObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(new DecimalTypeInfo(5, 2));
    LazyHiveDecimal lazyDecimal = (LazyHiveDecimal) LazyFactory.createLazyObject(lazyDecimalOI);
    lazyDecimal.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(decimalValue, lazyDecimal.getWritableObject().getHiveDecimal());
    // date
    cqHolder.set("date");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyDateObjectInspector lazyDateOI = (LazyDateObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.DATE_TYPE_NAME));
    LazyDate lazyDate = (LazyDate) LazyFactory.createLazyObject(lazyDateOI);
    lazyDate.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(dateValue, lazyDate.getWritableObject().get());
    // timestamp
    cqHolder.set("timestamp");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyTimestampObjectInspector lazyTimestampOI = (LazyTimestampObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.TIMESTAMP_TYPE_NAME));
    LazyTimestamp lazyTimestamp = (LazyTimestamp) LazyFactory.createLazyObject(lazyTimestampOI);
    lazyTimestamp.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(timestampValue, lazyTimestamp.getWritableObject().getTimestamp());
    // char
    cqHolder.set("char");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyHiveCharObjectInspector lazyCharOI = (LazyHiveCharObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(new CharTypeInfo(4));
    LazyHiveChar lazyChar = (LazyHiveChar) LazyFactory.createLazyObject(lazyCharOI);
    lazyChar.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(charValue, lazyChar.getWritableObject().getHiveChar());
    // varchar
    cqHolder.set("varchar");
    valueBytes = value.getValue(cfText, cqHolder);
    Assert.assertNotNull(valueBytes);
    byteRef.setData(valueBytes);
    LazyHiveVarcharObjectInspector lazyVarcharOI = (LazyHiveVarcharObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(new VarcharTypeInfo(7));
    LazyHiveVarchar lazyVarchar = (LazyHiveVarchar) LazyFactory.createLazyObject(lazyVarcharOI);
    lazyVarchar.init(byteRef, 0, valueBytes.length);
    Assert.assertEquals(varcharValue.toString(), lazyVarchar.getWritableObject().getHiveVarchar().toString());
}
Also used : LazyHiveVarchar(org.apache.hadoop.hive.serde2.lazy.LazyHiveVarchar) VarcharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo) LazyHiveDecimal(org.apache.hadoop.hive.serde2.lazy.LazyHiveDecimal) LazyIntObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyIntObjectInspector) LazyDoubleObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyDoubleObjectInspector) LazyString(org.apache.hadoop.hive.serde2.lazy.LazyString) AccumuloHiveRow(org.apache.hadoop.hive.accumulo.AccumuloHiveRow) LazyHiveChar(org.apache.hadoop.hive.serde2.lazy.LazyHiveChar) PasswordToken(org.apache.accumulo.core.client.security.tokens.PasswordToken) BatchWriterConfig(org.apache.accumulo.core.client.BatchWriterConfig) JobConf(org.apache.hadoop.mapred.JobConf) JavaHiveCharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveCharObjectInspector) LazyShortObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyShortObjectInspector) Authorizations(org.apache.accumulo.core.security.Authorizations) LazyLong(org.apache.hadoop.hive.serde2.lazy.LazyLong) LazyStringObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyStringObjectInspector) JavaStringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaStringObjectInspector) LazyDateObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyDateObjectInspector) CharTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo) JavaLongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaLongObjectInspector) LazyHiveVarcharObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyHiveVarcharObjectInspector) JavaTimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaTimestampObjectInspector) LazyBoolean(org.apache.hadoop.hive.serde2.lazy.LazyBoolean) LazyLongObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyLongObjectInspector) DecimalTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo) LazyByte(org.apache.hadoop.hive.serde2.lazy.LazyByte) JavaHiveVarcharObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveVarcharObjectInspector) ByteArrayRef(org.apache.hadoop.hive.serde2.lazy.ByteArrayRef) JavaFloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaFloatObjectInspector) Value(org.apache.accumulo.core.data.Value) LazyInteger(org.apache.hadoop.hive.serde2.lazy.LazyInteger) Mutation(org.apache.accumulo.core.data.Mutation) LazyDouble(org.apache.hadoop.hive.serde2.lazy.LazyDouble) Key(org.apache.accumulo.core.data.Key) Connector(org.apache.accumulo.core.client.Connector) LazyHiveCharObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyHiveCharObjectInspector) JavaIntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaIntObjectInspector) LazyTimestamp(org.apache.hadoop.hive.serde2.lazy.LazyTimestamp) HiveChar(org.apache.hadoop.hive.common.type.HiveChar) LazyBooleanObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyBooleanObjectInspector) LazyFloat(org.apache.hadoop.hive.serde2.lazy.LazyFloat) Timestamp(java.sql.Timestamp) JavaDoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaDoubleObjectInspector) LazyShort(org.apache.hadoop.hive.serde2.lazy.LazyShort) JavaHiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaHiveDecimalObjectInspector) MockInstance(org.apache.accumulo.core.client.mock.MockInstance) JavaDateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaDateObjectInspector) JavaByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaByteObjectInspector) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) JavaBooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaBooleanObjectInspector) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) DateWritable(org.apache.hadoop.hive.serde2.io.DateWritable) Text(org.apache.hadoop.io.Text) ByteArrayOutputStream(java.io.ByteArrayOutputStream) HiveVarchar(org.apache.hadoop.hive.common.type.HiveVarchar) LazyFloatObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyFloatObjectInspector) LazyTimestampObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyTimestampObjectInspector) LazyDate(org.apache.hadoop.hive.serde2.lazy.LazyDate) Date(java.sql.Date) LazyHiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyHiveDecimalObjectInspector) LazyByteObjectInspector(org.apache.hadoop.hive.serde2.lazy.objectinspector.primitive.LazyByteObjectInspector) BatchWriter(org.apache.accumulo.core.client.BatchWriter) JavaShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.JavaShortObjectInspector) Test(org.junit.Test)
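
Each of the thirteen type checks above follows the same write/read round trip. Distilled to a single type (int), the pattern is as follows; this is a minimal sketch reusing only classes already shown in the example, with the literal 42 as an arbitrary stand-in value:

// write: serialize a Java int as UTF-8 bytes, like the Mutation values above
ByteArrayOutputStream baos = new ByteArrayOutputStream();
JavaIntObjectInspector intOI = (JavaIntObjectInspector) PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.INT_TYPE_NAME));
LazyUtils.writePrimitiveUTF8(baos, intOI.create(42), intOI, false, (byte) 0, null);
// read: wrap the bytes in a ByteArrayRef and decode through the lazy object
ByteArrayRef ref = new ByteArrayRef();
ref.setData(baos.toByteArray());
LazyIntObjectInspector lazyIntOI = (LazyIntObjectInspector) LazyPrimitiveObjectInspectorFactory.getLazyObjectInspector(TypeInfoFactory.getPrimitiveTypeInfo(serdeConstants.INT_TYPE_NAME));
LazyInteger lazyInt = (LazyInteger) LazyFactory.createLazyObject(lazyIntOI);
lazyInt.init(ref, 0, ref.getData().length);
Assert.assertEquals(42, lazyInt.getWritableObject().get());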

Example 93 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class HiveSplitGenerator method pruneBuckets.

private InputSplit[] pruneBuckets(MapWork work, InputSplit[] splits) {
    final BitSet buckets = work.getIncludedBuckets();
    final String bucketIn = buckets.toString();
    List<InputSplit> filteredSplits = new ArrayList<InputSplit>(splits.length / 2);
    for (InputSplit split : splits) {
        final int bucket = Utilities.parseSplitBucket(split);
        if (bucket < 0 || buckets.get(bucket)) {
            // keep: bucket matches, or bucket is unknown (< 0)
            filteredSplits.add(split);
        } else {
            LOG.info("Pruning with IN ({}) - removing {}", bucketIn, split);
        }
    }
    if (filteredSplits.size() < splits.length) {
        // reallocate only if any splits were pruned
        splits = filteredSplits.toArray(new InputSplit[filteredSplits.size()]);
    }
    return splits;
}
Also used : BitSet(java.util.BitSet) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapred.InputSplit) TaskLocationHint(org.apache.tez.dag.api.TaskLocationHint) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint)
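
The keep/prune rule is conservative: a split whose bucket id cannot be parsed (negative) is always kept, and otherwise the split survives only if its bucket is set in the included-bucket BitSet. A tiny standalone illustration of that rule, with hypothetical bucket ids in place of real splits:

import java.util.BitSet;

public class BucketPruneDemo {
    public static void main(String[] args) {
        // included buckets, as MapWork.getIncludedBuckets() would supply
        BitSet buckets = new BitSet();
        buckets.set(0);
        buckets.set(2);
        // -1 models a split whose bucket could not be parsed: always kept
        for (int bucket : new int[] { -1, 0, 1, 2 }) {
            boolean kept = bucket < 0 || buckets.get(bucket);
            System.out.println("bucket " + bucket + " kept=" + kept);
        }
        // prints kept=true, true, false, true
    }
}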

Example 94 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class CustomPartitionVertex method getBucketSplitMapForPath.

/*
   * This method generates the map of bucket to file splits.
   */
private Multimap<Integer, InputSplit> getBucketSplitMapForPath(Map<String, Set<FileSplit>> pathFileSplitsMap) {
    int bucketNum = 0;
    Multimap<Integer, InputSplit> bucketToInitialSplitMap = ArrayListMultimap.<Integer, InputSplit>create();
    for (Map.Entry<String, Set<FileSplit>> entry : pathFileSplitsMap.entrySet()) {
        int bucketId = bucketNum % numBuckets;
        for (FileSplit fsplit : entry.getValue()) {
            bucketToInitialSplitMap.put(bucketId, fsplit);
        }
        bucketNum++;
    }
    // if there are fewer input paths than buckets, fill the remaining buckets by
    // reusing splits from the buckets populated so far
    if (bucketNum < numBuckets) {
        int loopedBucketId = 0;
        for (; bucketNum < numBuckets; bucketNum++) {
            for (InputSplit fsplit : bucketToInitialSplitMap.get(loopedBucketId)) {
                bucketToInitialSplitMap.put(bucketNum, fsplit);
            }
            loopedBucketId++;
        }
    }
    return bucketToInitialSplitMap;
}
Also used : Set(java.util.Set) TreeSet(java.util.TreeSet) ByteString(com.google.protobuf.ByteString) FileSplit(org.apache.hadoop.mapred.FileSplit) InputSplit(org.apache.hadoop.mapred.InputSplit) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) VertexLocationHint(org.apache.tez.dag.api.VertexLocationHint)
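
The method round-robins paths onto bucket ids, then wraps around and reuses earlier buckets' splits whenever there are fewer paths than buckets, so every bucket ends up non-empty. A standalone sketch of the same logic, with plain strings standing in for file splits (hypothetical data; assumes Guava's Multimap as in the example):

import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Multimap;

public class BucketFillDemo {
    public static void main(String[] args) {
        int numBuckets = 4;
        String[] paths = { "p0", "p1" }; // fewer paths than buckets
        Multimap<Integer, String> map = ArrayListMultimap.create();
        int bucketNum = 0;
        for (String p : paths) {
            map.put(bucketNum % numBuckets, p);
            bucketNum++;
        }
        // fill the remaining buckets by cycling over the populated ones
        int loopedBucketId = 0;
        for (; bucketNum < numBuckets; bucketNum++) {
            for (String s : map.get(loopedBucketId)) {
                map.put(bucketNum, s);
            }
            loopedBucketId++;
        }
        System.out.println(map); // {0=[p0], 1=[p1], 2=[p0], 3=[p1]}
    }
}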

Example 95 with InputSplit

use of org.apache.hadoop.mapred.InputSplit in project hive by apache.

the class StatsNoJobTask method aggregateStats.

private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
        Collection<Partition> partitions = null;
        if (work.getPrunedPartitionList() == null) {
            partitions = getPartitionsList();
        } else {
            partitions = work.getPrunedPartitionList().getPartitions();
        }
        // non-partitioned table
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            try {
                Path dir = new Path(tTable.getSd().getLocation());
                long numRows = 0;
                long rawDataSize = 0;
                long fileSize = 0;
                long numFiles = 0;
                FileSystem fs = dir.getFileSystem(conf);
                FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
                boolean statsAvailable = false;
                for (FileStatus file : fileList) {
                    if (!file.isDir()) {
                        InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
                        InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
                        if (file.getLen() == 0) {
                            numFiles += 1;
                            statsAvailable = true;
                        } else {
                            org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
                            StatsProvidingRecordReader statsRR;
                            if (recordReader instanceof StatsProvidingRecordReader) {
                                statsRR = (StatsProvidingRecordReader) recordReader;
                                numRows += statsRR.getStats().getRowCount();
                                rawDataSize += statsRR.getStats().getRawDataSize();
                                fileSize += file.getLen();
                                numFiles += 1;
                                statsAvailable = true;
                            }
                            recordReader.close();
                        }
                    }
                }
                if (statsAvailable) {
                    parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
                    parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
                    parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
                    parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
                    EnvironmentContext environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    db.alterTable(tableFullName, new Table(tTable), environmentContext);
                    String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
                    LOG.debug(msg);
                    console.printInfo(msg);
                } else {
                    String msg = "Table " + tableFullName + " does not provide stats.";
                    LOG.debug(msg);
                }
            } catch (Exception e) {
                console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            }
        } else {
            // Partitioned table
            for (Partition partn : partitions) {
                threadPool.execute(new StatsCollection(partn));
            }
            LOG.debug("Stats collection waiting for threadpool to shutdown..");
            shutdownAndAwaitTermination(threadPool);
            LOG.debug("Stats collection threadpool shutdown successful.");
            ret = updatePartitions(db);
        }
    } catch (Exception e) {
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = -1;
        }
    }
    // a return value of 0 indicates success; anything else indicates failure
    return ret;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
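
The core trick above: for self-describing formats such as ORC, a zero-length dummy FileSplit is enough to open a reader and pull row counts from file metadata without scanning any rows. Condensed to its essence, the probe looks like this; a sketch over the fixtures used above (jc, table, and a FileStatus file), with SerDeStats (org.apache.hadoop.hive.serde2.SerDeStats) named explicitly, and the reader closed in a finally block so an exception from getStats() cannot leak it:

InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
try {
    if (recordReader instanceof StatsProvidingRecordReader) {
        // stats come from file metadata; no rows are actually read
        SerDeStats stats = ((StatsProvidingRecordReader) recordReader).getStats();
        System.out.println("rows=" + stats.getRowCount() + " rawDataSize=" + stats.getRawDataSize());
    }
} finally {
    recordReader.close();
}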

Aggregations

InputSplit (org.apache.hadoop.mapred.InputSplit): 161
Path (org.apache.hadoop.fs.Path): 57
JobConf (org.apache.hadoop.mapred.JobConf): 56
Test (org.junit.Test): 49
IOException (java.io.IOException): 47
ArrayList (java.util.ArrayList): 29
StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 27
FileSplit (org.apache.hadoop.mapred.FileSplit): 24
FileSystem (org.apache.hadoop.fs.FileSystem): 21
TextInputFormat (org.apache.hadoop.mapred.TextInputFormat): 21
InputFormat (org.apache.hadoop.mapred.InputFormat): 19
RecordWriter (org.apache.hadoop.mapred.RecordWriter): 19
NullWritable (org.apache.hadoop.io.NullWritable): 18
Text (org.apache.hadoop.io.Text): 18
Configuration (org.apache.hadoop.conf.Configuration): 14
LongWritable (org.apache.hadoop.io.LongWritable): 11
FileInputFormat (org.apache.hadoop.mapred.FileInputFormat): 10
Properties (java.util.Properties): 9
TaskLocationHint (org.apache.tez.dag.api.TaskLocationHint): 9
HashMap (java.util.HashMap): 8