
Example 76 with Partition

use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.

the class GenMRTableScan1 method process.

/**
 * Table scan encountered.
 * @param nd
 *          the table scan operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Table table = op.getConf().getTableMetadata();
    Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork);
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, the following statements are equivalent:
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS NOSCAN;
                    // Neither launches an MR or Tez job above this task.
                    StatsWork statWork = new StatsWork(table, parseCtx.getConf());
                    statWork.setFooterScan();
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                        statWork.addInputPartitions(partList.getPartitions());
                    }
                    Task<StatsWork> snjTask = TaskFactory.get(statWork);
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    BasicStatsWork statsWork = new BasicStatsWork(table.getTableSpec());
                    statsWork.setNoScanAnalyzeCommand(noScan);
                    StatsWork columnStatsWork = new StatsWork(table, statsWork, parseCtx.getConf());
                    columnStatsWork.collectStatsFromAggregator(op.getConf());
                    columnStatsWork.setSourceTask(currTask);
                    Task<StatsWork> columnStatsTask = TaskFactory.get(columnStatsWork);
                    currTask.addDependentTask(columnStatsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // For noscan, the plan consists of a StatsTask only.
                    if (noScan) {
                        columnStatsTask.setParentTasks(null);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(columnStatsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get
                    // the pruned partition list, and pass it to setTaskPlan as the last parameter.
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) BasicStatsWork(org.apache.hadoop.hive.ql.plan.BasicStatsWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
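
For orientation, here is a minimal sketch of how a NodeProcessor like GenMRTableScan1 gets wired to TableScanOperator nodes, roughly following MapReduceCompiler's rule setup. It assumes the Hive 3.x org.apache.hadoop.hive.ql.lib walker API; the method name walkPlan is illustrative.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext;
import org.apache.hadoop.hive.ql.optimizer.GenMROperator;
import org.apache.hadoop.hive.ql.optimizer.GenMRTableScan1;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedWalker;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;

void walkPlan(ParseContext parseCtx, GenMRProcContext procCtx) throws SemanticException {
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    // Fire GenMRTableScan1 whenever the walker reaches a TableScanOperator ("TS").
    opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"), new GenMRTableScan1());
    // GenMROperator is the fallback processor for nodes that match no rule.
    Dispatcher disp = new DefaultRuleDispatcher(new GenMROperator(), opRules, procCtx);
    GraphWalker walker = new GenMapRedWalker(disp);
    // Start from the top table-scan operators; process(...) above fires per match.
    walker.startWalking(new ArrayList<Node>(parseCtx.getTopOps().values()), null);
}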

Example 77 with Partition

use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.

the class TestHCatMultiOutputFormat method getTableData.

/**
 * Method to fetch table data
 *
 * @param table table name
 * @param database database name
 * @return list of rows, with columns comma-separated
 * @throws Exception if any error occurs
 */
private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState.Builder().build();
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
        List<Partition> partitions = hive.getPartitions(tbl);
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        List<Path> partLocs = new ArrayList<Path>();
        TableDesc tableDesc = Utilities.getTableDesc(tbl);
        for (Partition part : partitions) {
            partLocs.add(part.getDataLocation());
            partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
        }
        work = new FetchWork(partLocs, partDesc, tableDesc);
        work.setLimit(100);
    } else {
        work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    task.initialize(queryState, null, null, new CompilationOpContext());
    task.fetch(temp);
    for (String str : temp) {
        results.add(str.replace("\t", ","));
    }
    return results;
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) ArrayList(java.util.ArrayList) QueryState(org.apache.hadoop.hive.ql.QueryState) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) Hive(org.apache.hadoop.hive.ql.metadata.Hive) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
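
A hypothetical usage from inside the same test class, assuming JUnit 4. The table name, database, and expected row below are placeholders for illustration, not actual TestHCatMultiOutputFormat fixtures.

import java.util.List;

import org.junit.Assert;
import org.junit.Test;

@Test
public void testFetchedRowsAreCommaSeparated() throws Exception {
    // "employee" and "default" are placeholder names for this sketch.
    List<String> rows = getTableData("employee", "default");
    Assert.assertFalse(rows.isEmpty());
    // getTableData replaces tab separators with commas, so each row reads as a CSV line.
    Assert.assertEquals("1,alice,2011", rows.get(0));
}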

Example 78 with Partition

use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.

the class LoadPartitions method forExistingTable.

private TaskTracker forExistingTable(AddPartitionDesc lastPartitionReplicated) throws Exception {
    boolean encounteredTheLastReplicatedPartition = (lastPartitionReplicated == null);
    Map<String, String> lastReplicatedPartSpec = null;
    if (!encounteredTheLastReplicatedPartition) {
        lastReplicatedPartSpec = lastPartitionReplicated.getPartition(0).getPartSpec();
        LOG.info("Start processing from partition info spec : {}", StringUtils.mapToString(lastReplicatedPartSpec));
    }
    ReplicationSpec replicationSpec = event.replicationSpec();
    Iterator<AddPartitionDesc> partitionIterator = event.partitionDescriptions(tableDesc).iterator();
    while (!encounteredTheLastReplicatedPartition && partitionIterator.hasNext()) {
        AddPartitionDesc addPartitionDesc = partitionIterator.next();
        Map<String, String> currentSpec = addPartitionDesc.getPartition(0).getPartSpec();
        encounteredTheLastReplicatedPartition = lastReplicatedPartSpec.equals(currentSpec);
    }
    while (partitionIterator.hasNext() && tracker.canAddMoreTasks()) {
        AddPartitionDesc addPartitionDesc = partitionIterator.next();
        Map<String, String> partSpec = addPartitionDesc.getPartition(0).getPartSpec();
        Partition ptn = context.hiveDb.getPartition(table, partSpec, false);
        if (ptn == null) {
            if (!replicationSpec.isMetadataOnly()) {
                addPartition(partitionIterator.hasNext(), addPartitionDesc);
            }
        } else {
            // The partition already exists on the destination; replace it only if its
            // repl.last.id is older than the replacement's.
            if (replicationSpec.allowReplacementInto(ptn.getParameters())) {
                if (replicationSpec.isMetadataOnly()) {
                    tracker.addTask(alterSinglePartition(addPartitionDesc, replicationSpec, ptn));
                    if (!tracker.canAddMoreTasks()) {
                        tracker.setReplicationState(new ReplicationState(new PartitionState(table.getTableName(), addPartitionDesc)));
                    }
                } else {
                    addPartition(partitionIterator.hasNext(), addPartitionDesc);
                }
            } else {
            // ignore this ptn, do nothing, not an error.
            }
        }
    }
    return tracker;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) ReplicationSpec(org.apache.hadoop.hive.ql.parse.ReplicationSpec) AddPartitionDesc(org.apache.hadoop.hive.ql.plan.AddPartitionDesc) PartitionState(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.ReplicationState.PartitionState) ImportSemanticAnalyzer.partSpecToString(org.apache.hadoop.hive.ql.parse.ImportSemanticAnalyzer.partSpecToString) ReplicationState(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.ReplicationState)
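
The two-loop structure above (consume entries up to the checkpoint, then replicate the rest) is the core of the resume logic. Here is a self-contained sketch of the same pattern with plain strings standing in for partition specs; all names and values are illustrative.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ResumeFromCheckpoint {
    public static void main(String[] args) {
        List<String> partSpecs = Arrays.asList("dt=01", "dt=02", "dt=03", "dt=04");
        String lastReplicated = "dt=02"; // checkpoint left behind by an earlier, interrupted run
        Iterator<String> it = partSpecs.iterator();
        boolean encountered = (lastReplicated == null);
        // First loop: skip entries up to and including the checkpoint.
        while (!encountered && it.hasNext()) {
            encountered = lastReplicated.equals(it.next());
        }
        // Second loop: replicate only what comes after the checkpoint.
        while (it.hasNext()) {
            System.out.println("replicating " + it.next()); // prints dt=03, dt=04
        }
    }
}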

Example 79 with Partition

use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.

the class BasicStatsNoJobTask method aggregateStats.

private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
        JobConf jc = new JobConf(conf);
        TableSpec tableSpecs = work.getTableSpecs();
        if (tableSpecs == null) {
            throw new RuntimeException("this is unexpected...needs some investigation");
        }
        Table table = tableSpecs.tableHandle;
        Collection<Partition> partitions = null;
        if (work.getPartitions() == null || work.getPartitions().isEmpty()) {
            if (table.isPartitioned()) {
                partitions = tableSpecs.partitions;
            }
        } else {
            partitions = work.getPartitions();
        }
        LinkedList<Partish> partishes = Lists.newLinkedList();
        if (partitions == null) {
            partishes.add(Partish.buildFor(table));
        } else {
            for (Partition part : partitions) {
                partishes.add(Partish.buildFor(table, part));
            }
        }
        List<FooterStatCollector> scs = Lists.newArrayList();
        for (Partish partish : partishes) {
            scs.add(new FooterStatCollector(jc, partish));
        }
        for (FooterStatCollector sc : scs) {
            sc.init(conf, console);
            threadPool.execute(sc);
        }
        LOG.debug("Stats collection waiting for threadpool to shutdown..");
        shutdownAndAwaitTermination(threadPool);
        LOG.debug("Stats collection threadpool shutdown successful.");
        ret = updatePartitions(db, scs, table);
    } catch (Exception e) {
        console.printError("Failed to collect footer statistics.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = -1;
        }
    }
    // 0 indicates success; anything else indicates failure
    return ret;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableSpec(org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec) Table(org.apache.hadoop.hive.ql.metadata.Table) JobConf(org.apache.hadoop.mapred.JobConf) IOException(java.io.IOException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
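
shutdownAndAwaitTermination is a helper in the same class; the standard ExecutorService idiom it stands for looks roughly like the self-contained sketch below. Pool size, task count, and timeout are illustrative, and the Runnable is a stand-in for FooterStatCollector.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class DrainStatsPool {
    public static void main(String[] args) throws InterruptedException {
        ExecutorService threadPool = Executors.newFixedThreadPool(4);
        for (int i = 0; i < 8; i++) {
            final int partitionId = i;
            // Stand-in for FooterStatCollector: each task reads one partition's file footers.
            threadPool.execute(() -> System.out.println("collected stats for partition " + partitionId));
        }
        threadPool.shutdown(); // no new tasks accepted from here on
        if (!threadPool.awaitTermination(10, TimeUnit.MINUTES)) { // block until in-flight tasks finish
            threadPool.shutdownNow(); // timed out: interrupt stragglers
        }
    }
}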

Example 80 with Partition

use of org.apache.hadoop.hive.ql.metadata.Partition in project hive by apache.

the class BasicStatsTask method getPartitionsList.

/**
 * Get the list of partitions that need to update statistics.
 * TODO: we should reuse the Partitions generated at compile time
 * since getting the list of partitions is quite expensive.
 *
 * @return a list of partitions that need to update statistics.
 * @throws HiveException
 */
private List<Partition> getPartitionsList(Hive db) throws HiveException {
    if (work.getLoadFileDesc() != null) {
        // we are in CTAS, so we know there are no partitions
        return null;
    }
    List<Partition> list = new ArrayList<Partition>();
    if (work.getTableSpecs() != null) {
        // ANALYZE command
        TableSpec tblSpec = work.getTableSpecs();
        table = tblSpec.tableHandle;
        if (!table.isPartitioned()) {
            return null;
        }
        // get all partitions that matches with the partition spec
        List<Partition> partitions = tblSpec.partitions;
        if (partitions != null) {
            for (Partition partn : partitions) {
                list.add(partn);
            }
        }
    } else if (work.getLoadTableDesc() != null) {
        // INSERT OVERWRITE command
        LoadTableDesc tbd = work.getLoadTableDesc();
        table = db.getTable(tbd.getTable().getTableName());
        if (!table.isPartitioned()) {
            return null;
        }
        DynamicPartitionCtx dpCtx = tbd.getDPCtx();
        if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
            // If no dynamic partitions are generated, dpPartSpecs may not be initialized
            if (dpPartSpecs != null) {
                // load the list of DP partitions and return the list of partition specs
                list.addAll(dpPartSpecs);
            }
        } else {
            // static partition
            Partition partn = db.getPartition(table, tbd.getPartitionSpec(), false);
            list.add(partn);
        }
    }
    return list;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableSpec(org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec) ArrayList(java.util.ArrayList) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx)
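
For reference, the partition spec handed to db.getPartition in the static-partition branch above is an ordered map keyed by the table's partition columns. A minimal illustration, with hypothetical column names and values:

import java.util.LinkedHashMap;
import java.util.Map;

// Spec for a table partitioned by (ds string, hr string); insertion order
// follows the declared partition-column order.
static Map<String, String> examplePartSpec() {
    Map<String, String> partSpec = new LinkedHashMap<String, String>();
    partSpec.put("ds", "2018-01-01");
    partSpec.put("hr", "12");
    // Passed as: db.getPartition(table, partSpec, false)
    return partSpec;
}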

Aggregations

Partition (org.apache.hadoop.hive.ql.metadata.Partition) 102
Table (org.apache.hadoop.hive.ql.metadata.Table) 56
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 48
ArrayList (java.util.ArrayList) 43
Path (org.apache.hadoop.fs.Path) 25
AlterTableExchangePartition (org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) 25
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity) 24
IOException (java.io.IOException) 18
HashMap (java.util.HashMap) 18
LinkedHashMap (java.util.LinkedHashMap) 18
ReadEntity (org.apache.hadoop.hive.ql.hooks.ReadEntity) 18
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 18
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList) 17
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 14
FileNotFoundException (java.io.FileNotFoundException) 12
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 12
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 12
InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException) 11
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) 11
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) 11