Example 1 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class Utilities, method createDummyFileForEmptyPartition.

@SuppressWarnings("rawtypes")
private static Path createDummyFileForEmptyPartition(Path path, JobConf job, MapWork work, Path hiveScratchDir) throws Exception {
    String strPath = path.toString();
    // The input file does not exist; replace it with an empty file.
    PartitionDesc partDesc = work.getPathToPartitionInfo().get(path);
    if (partDesc.getTableDesc().isNonNative()) {
        // If this isn't a Hive table, we can't create an empty file for it.
        return path;
    }
    Properties props = SerDeUtils.createOverlayedProperties(partDesc.getTableDesc().getProperties(), partDesc.getProperties());
    HiveOutputFormat outFileFormat = HiveFileFormatUtils.getHiveOutputFormat(job, partDesc);
    boolean oneRow = partDesc.getInputFileFormatClass() == OneNullRowInputFormat.class;
    Path newPath = createEmptyFile(hiveScratchDir, outFileFormat, job, props, oneRow);
    if (LOG.isInfoEnabled()) {
        LOG.info("Changed input file " + strPath + " to empty file " + newPath + " (" + oneRow + ")");
    }
    // Update the work: remap the alias and partition info from the old path to the new empty file.
    work.addPathToAlias(newPath, work.getPathToAliases().get(path));
    work.removePathToAlias(path);
    work.removePathToPartitionInfo(path);
    work.addPathToPartitionInfo(newPath, partDesc);
    return newPath;
}
Also used: Path (org.apache.hadoop.fs.Path), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), HiveOutputFormat (org.apache.hadoop.hive.ql.io.HiveOutputFormat), Properties (java.util.Properties)
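
For orientation, a minimal sketch of how a path-resolution step might invoke this helper. The resolveInputPath name and the explicit FileSystem existence check are assumptions for illustration, not the exact Utilities flow:

// Hypothetical caller: keep paths that exist, substitute a dummy empty file
// for missing ones so downstream InputFormats still get a readable source.
// resolveInputPath is an illustrative name, not a real Utilities method.
private static Path resolveInputPath(Path path, JobConf job, MapWork work,
        Path hiveScratchDir, FileSystem fs) throws Exception {
    if (fs.exists(path)) {
        // The partition directory has real data; use it unchanged.
        return path;
    }
    return createDummyFileForEmptyPartition(path, job, work, hiveScratchDir);
}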

Example 2 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class DynamicPartitionPruner, method applyFilterToPartitions.

@SuppressWarnings("rawtypes")
private void applyFilterToPartitions(Converter converter, ExprNodeEvaluator eval, String columnName, Set<Object> values) throws HiveException {
    Object[] row = new Object[1];
    Iterator<Path> it = work.getPathToPartitionInfo().keySet().iterator();
    while (it.hasNext()) {
        Path p = it.next();
        PartitionDesc desc = work.getPathToPartitionInfo().get(p);
        Map<String, String> spec = desc.getPartSpec();
        if (spec == null) {
            throw new IllegalStateException("No partition spec found in dynamic pruning");
        }
        String partValueString = spec.get(columnName);
        if (partValueString == null) {
            throw new IllegalStateException("Could not find partition value for column: " + columnName);
        }
        Object partValue = converter.convert(partValueString);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Converted partition value: " + partValue + " original (" + partValueString + ")");
        }
        row[0] = partValue;
        partValue = eval.evaluate(row);
        if (LOG.isDebugEnabled()) {
            LOG.debug("part key expr applied: " + partValue);
        }
        if (!values.contains(partValue)) {
            LOG.info("Pruning path: " + p);
            it.remove();
            // The keySet iterator is backed by pathToPartitionInfo, so the
            // it.remove() above already dropped the map entry; an explicit
            // work.removePathToPartitionInfo(p) call would be redundant.
            work.removePathToAlias(p);
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)
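
The core of this method is prune-by-predicate over the map's key set, with removal routed through the iterator. A self-contained sketch of the same pattern with plain Java collections shows why that matters:

import java.util.Iterator;
import java.util.Map;
import java.util.Set;

public class PrunePattern {
    /** Remove every map entry whose value is absent from the keep-set. */
    static void prune(Map<String, String> pathToPartValue, Set<String> keep) {
        Iterator<Map.Entry<String, String>> it = pathToPartValue.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<String, String> e = it.next();
            if (!keep.contains(e.getValue())) {
                // it.remove() is the only safe deletion during iteration; calling
                // pathToPartValue.remove(...) here would make the next it.next()
                // throw ConcurrentModificationException.
                it.remove();
            }
        }
    }
}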

Example 3 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class TestUtilities, method runTestGetInputSummary.

private ContentSummary runTestGetInputSummary(JobConf jobConf, Properties properties, int numOfPartitions, int bytesPerFile, Class<? extends InputFormat> inputFormatClass) throws IOException {
    // creates scratch directories needed by the Context object
    SessionState.start(new HiveConf());
    MapWork mapWork = new MapWork();
    Context context = new Context(jobConf);
    LinkedHashMap<Path, PartitionDesc> pathToPartitionInfo = new LinkedHashMap<>();
    LinkedHashMap<Path, ArrayList<String>> pathToAliasTable = new LinkedHashMap<>();
    TableScanOperator scanOp = new TableScanOperator();
    PartitionDesc partitionDesc = new PartitionDesc(new TableDesc(inputFormatClass, null, properties), null);
    String testTableName = "testTable";
    Path testTablePath = new Path(testTableName);
    Path[] testPartitionsPaths = new Path[numOfPartitions];
    for (int i = 0; i < numOfPartitions; i++) {
        String testPartitionName = "p=" + i;
        testPartitionsPaths[i] = new Path(testTablePath, "p=" + i);
        pathToPartitionInfo.put(testPartitionsPaths[i], partitionDesc);
        pathToAliasTable.put(testPartitionsPaths[i], Lists.newArrayList(testPartitionName));
        mapWork.getAliasToWork().put(testPartitionName, scanOp);
    }
    mapWork.setPathToAliases(pathToAliasTable);
    mapWork.setPathToPartitionInfo(pathToPartitionInfo);
    FileSystem fs = FileSystem.getLocal(jobConf);
    try {
        fs.mkdirs(testTablePath);
        byte[] data = new byte[bytesPerFile];
        for (int i = 0; i < numOfPartitions; i++) {
            fs.mkdirs(testPartitionsPaths[i]);
            try (FSDataOutputStream out = fs.create(new Path(testPartitionsPaths[i], "test1.txt"))) {
                out.write(data);
            }
        }
        return Utilities.getInputSummary(context, mapWork, null);
    } finally {
        if (fs.exists(testTablePath)) {
            fs.delete(testTablePath, true);
        }
    }
}
Also used: Context (org.apache.hadoop.hive.ql.Context), Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), LinkedHashMap (java.util.LinkedHashMap), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), FileSystem (org.apache.hadoop.fs.FileSystem), HiveConf (org.apache.hadoop.hive.conf.HiveConf), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)
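
A hedged example of how a test case might drive this helper. The test name, the byte counts, and the choice of TextInputFormat are illustrative assumptions, and the assertions presume getInputSummary aggregates raw file lengths:

@Test
public void testGetInputSummaryPlainText() throws IOException {
    // 5 partitions with one 200-byte file each; assumed to sum to 1000 bytes.
    ContentSummary summary = runTestGetInputSummary(new JobConf(), new Properties(),
            5, 200, org.apache.hadoop.mapred.TextInputFormat.class);
    assertEquals(5 * 200, summary.getLength());
    assertEquals(5, summary.getFileCount());
}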

Example 4 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class TestUtilities, method testGetInputPathsWithEmptyTables.

/**
   * Check that calling {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)}
   * can process two different empty tables without throwing any exceptions.
   */
@Test
public void testGetInputPathsWithEmptyTables() throws Exception {
    String alias1Name = "alias1";
    String alias2Name = "alias2";
    MapWork mapWork1 = new MapWork();
    MapWork mapWork2 = new MapWork();
    JobConf jobConf = new JobConf();
    Path nonExistentPath1 = new Path(UUID.randomUUID().toString());
    Path nonExistentPath2 = new Path(UUID.randomUUID().toString());
    PartitionDesc mockPartitionDesc = mock(PartitionDesc.class);
    TableDesc mockTableDesc = mock(TableDesc.class);
    when(mockTableDesc.isNonNative()).thenReturn(false);
    when(mockTableDesc.getProperties()).thenReturn(new Properties());
    when(mockPartitionDesc.getProperties()).thenReturn(new Properties());
    when(mockPartitionDesc.getTableDesc()).thenReturn(mockTableDesc);
    doReturn(HiveSequenceFileOutputFormat.class).when(mockPartitionDesc).getOutputFileFormatClass();
    mapWork1.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, Lists.newArrayList(alias1Name))));
    mapWork1.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(ImmutableMap.of(alias1Name, (Operator<?>) mock(Operator.class))));
    mapWork1.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath1, mockPartitionDesc)));
    mapWork2.setPathToAliases(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, Lists.newArrayList(alias2Name))));
    mapWork2.setAliasToWork(new LinkedHashMap<String, Operator<? extends OperatorDesc>>(ImmutableMap.of(alias2Name, (Operator<?>) mock(Operator.class))));
    mapWork2.setPathToPartitionInfo(new LinkedHashMap<>(ImmutableMap.of(nonExistentPath2, mockPartitionDesc)));
    List<Path> inputPaths = new ArrayList<>();
    try {
        Path scratchDir = new Path(HiveConf.getVar(jobConf, HiveConf.ConfVars.LOCALSCRATCHDIR));
        inputPaths.addAll(Utilities.getInputPaths(jobConf, mapWork1, scratchDir, mock(Context.class), false));
        inputPaths.addAll(Utilities.getInputPaths(jobConf, mapWork2, scratchDir, mock(Context.class), false));
        assertEquals(2, inputPaths.size());
    } finally {
        for (Path path : inputPaths) {
            File file = new File(path.toString());
            if (file.exists()) {
                file.delete();
            }
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), ArrayList (java.util.ArrayList), Properties (java.util.Properties), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), JobConf (org.apache.hadoop.mapred.JobConf), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc), File (java.io.File), Test (org.junit.Test)
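
For completeness, the static imports this mock-based test depends on are the standard JUnit 4 and Mockito entry points:

import static org.junit.Assert.assertEquals;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;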

Example 5 with PartitionDesc

Use of org.apache.hadoop.hive.ql.plan.PartitionDesc in project hive by apache: class VectorMapOperator, method setupPartitionContextVars.

/*
   * Set up the context for reading from the next partition file.
   */
private void setupPartitionContextVars(String nominalPath) throws HiveException {
    currentVectorPartContext = fileToPartitionContextMap.get(nominalPath);
    if (currentVectorPartContext == null) {
        return;
    }
    PartitionDesc partDesc = currentVectorPartContext.getPartDesc();
    VectorPartitionDesc vectorPartDesc = partDesc.getVectorPartitionDesc();
    currentReadType = vectorPartDesc.getVectorMapOperatorReadType();
    /*
     * Set up for the three kinds of vectorized reading supported:
     *
     *   1) Read the Vectorized Input File Format which returns VectorizedRowBatch as the row.
     *
     *   2) Read using VectorDeserializeRow to deserialize each row into the VectorizedRowBatch.
     *
     *   3) Read using the regular partition deserializer to get the row object, then assign
     *      the row object into the VectorizedRowBatch with VectorAssignRow.
     */
    if (currentReadType == VectorMapOperatorReadType.VECTORIZED_INPUT_FILE_FORMAT) {
        /*
         * The Vectorized Input File Format reader is responsible for setting the partition column
         * values, resetting and filling in the batch, etc.
         */
        /*
         * Clear all the reading variables.
         */
        currentDataColumnCount = 0;
        currentDeserializeRead = null;
        currentVectorDeserializeRow = null;
        currentPartDeserializer = null;
        currentPartRawRowObjectInspector = null;
        currentVectorAssign = null;
    } else {
        /*
         * We will get "regular" single rows from the Input File Format reader that we will need
         * to {vector|row} deserialize.
         */
        Preconditions.checkState(currentReadType == VectorMapOperatorReadType.VECTOR_DESERIALIZE || currentReadType == VectorMapOperatorReadType.ROW_DESERIALIZE);
        /*
         * Clear out any rows in the batch from the previous partition, since we are going to change
         * the repeating partition column values.
         */
        if (!flushDeserializerBatch()) {
            // Operator tree is now done.
            return;
        }
        /*
         * For this particular file, how many columns will we actually read?
         */
        currentDataColumnCount = currentVectorPartContext.getReaderDataColumnCount();
        if (currentDataColumnCount < dataColumnCount) {
            /*
             * Default any additional data columns to NULL once for the file (if they are present).
             */
            for (int i = currentDataColumnCount; i < dataColumnCount; i++) {
                ColumnVector colVector = deserializerBatch.cols[i];
                if (colVector != null) {
                    colVector.isNull[0] = true;
                    colVector.noNulls = false;
                    colVector.isRepeating = true;
                }
            }
        }
        if (batchContext.getPartitionColumnCount() > 0) {
            /*
             * The partition columns are set once for the partition and are marked repeating.
             */
            VectorizedRowBatchCtx.getPartitionValues(batchContext, partDesc, partitionValues);
            batchContext.addPartitionColsToBatch(deserializerBatch, partitionValues);
        }
        if (hasRowIdentifier) {
            // No ACID in code path -- set ROW__ID to NULL.
            setRowIdentiferToNull(deserializerBatch);
        }
        /*
         * Set or clear the rest of the reading variables based on {vector|row} deserialization.
         */
        switch(currentReadType) {
            case VECTOR_DESERIALIZE:
                {
                    VectorDeserializePartitionContext vectorDeserPartContext = (VectorDeserializePartitionContext) currentVectorPartContext;
                    // Set ours.
                    currentDeserializeRead = vectorDeserPartContext.getDeserializeRead();
                    currentVectorDeserializeRow = vectorDeserPartContext.getVectorDeserializeRow();
                    // Clear the other ones.
                    currentPartDeserializer = null;
                    currentPartRawRowObjectInspector = null;
                    currentVectorAssign = null;
                }
                break;
            case ROW_DESERIALIZE:
                {
                    RowDeserializePartitionContext rowDeserPartContext = (RowDeserializePartitionContext) currentVectorPartContext;
                    // Clear the other ones.
                    currentDeserializeRead = null;
                    currentVectorDeserializeRow = null;
                    // Set ours.
                    currentPartDeserializer = rowDeserPartContext.getPartDeserializer();
                    currentPartRawRowObjectInspector = rowDeserPartContext.getPartRawRowObjectInspector();
                    currentVectorAssign = rowDeserPartContext.getVectorAssign();
                }
                break;
            default:
                throw new RuntimeException("Unexpected VectorMapOperator read type " + currentReadType.name());
        }
    }
}
Also used: PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), VectorPartitionDesc (org.apache.hadoop.hive.ql.plan.VectorPartitionDesc)
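
As a compact summary of the three-way branch above, here is a sketch of the dispatch shape. The enum constants are the ones referenced in the code; the comments paraphrase the setup logic (illustrative fragment, not the actual method body):

switch (vectorPartDesc.getVectorMapOperatorReadType()) {
    case VECTORIZED_INPUT_FILE_FORMAT:
        // The reader emits whole VectorizedRowBatch objects and fills in
        // partition columns itself; all deserialize state is cleared.
        break;
    case VECTOR_DESERIALIZE:
        // Rows arrive serialized; VectorDeserializeRow writes fields
        // directly into the batch's column vectors.
        break;
    case ROW_DESERIALIZE:
        // Rows arrive as objects from the partition deserializer;
        // VectorAssignRow copies each field into the batch.
        break;
}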

Aggregations (usage counts)

PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 90
Path (org.apache.hadoop.fs.Path): 67
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 41
ArrayList (java.util.ArrayList): 39
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 27
LinkedHashMap (java.util.LinkedHashMap): 24
List (java.util.List): 23
JobConf (org.apache.hadoop.mapred.JobConf): 21
Map (java.util.Map): 18
Properties (java.util.Properties): 18
HashMap (java.util.HashMap): 17
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17
IOException (java.io.IOException): 15
Operator (org.apache.hadoop.hive.ql.exec.Operator): 15
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 14
Configuration (org.apache.hadoop.conf.Configuration): 13
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13
FileSystem (org.apache.hadoop.fs.FileSystem): 11
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 9
HiveInputFormat (org.apache.hadoop.hive.ql.io.HiveInputFormat): 9