Search in sources:

Example 1 with HadoopArchives

Use of org.apache.hadoop.tools.HadoopArchives in project hive by apache.

The archive method of the DDLTask class. It implements Hive's ALTER TABLE ... ARCHIVE PARTITION operation: the files of a managed table partition (or partition group) are packed into a data.har archive and the partition metadata is updated to point into the archive.

private int archive(Hive db, AlterTableSimpleDesc simpleDesc, DriverContext driverContext) throws HiveException {
    Table tbl = db.getTable(simpleDesc.getTableName());
    if (tbl.getTableType() != TableType.MANAGED_TABLE) {
        throw new HiveException("ARCHIVE can only be performed on managed tables");
    }
    Map<String, String> partSpec = simpleDesc.getPartSpec();
    PartSpecInfo partSpecInfo = PartSpecInfo.create(tbl, partSpec);
    List<Partition> partitions = db.getPartitions(tbl, partSpec);
    Path originalDir = null;
    // When only a partial partition specification is given, all matched partitions
    // must lie in the standard location scheme (archiving custom locations into one
    // archive would require mass copying); a full specification still allows custom
    // locations to keep backward compatibility
    if (partitions.isEmpty()) {
        throw new HiveException("No partition matches the specification");
    } else if (partSpecInfo.values.size() != tbl.getPartCols().size()) {
        // for partial specifications we need partitions to follow the scheme
        for (Partition p : partitions) {
            if (partitionInCustomLocation(tbl, p)) {
                String message = String.format("ARCHIVE cannot run for partition " + "groups with custom locations like %s", p.getLocation());
                throw new HiveException(message);
            }
        }
        originalDir = partSpecInfo.createPath(tbl);
    } else {
        Partition p = partitions.get(0);
        // the partition may already be marked as archived if we are recovering from a failed ARCHIVE
        if (ArchiveUtils.isArchived(p)) {
            originalDir = new Path(getOriginalLocation(p));
        } else {
            originalDir = p.getDataLocation();
        }
    }
    Path intermediateArchivedDir = new Path(originalDir.getParent(), originalDir.getName() + INTERMEDIATE_ARCHIVED_DIR_SUFFIX);
    Path intermediateOriginalDir = new Path(originalDir.getParent(), originalDir.getName() + INTERMEDIATE_ORIGINAL_DIR_SUFFIX);
    console.printInfo("intermediate.archived is " + intermediateArchivedDir.toString());
    console.printInfo("intermediate.original is " + intermediateOriginalDir.toString());
    String archiveName = "data.har";
    FileSystem fs = null;
    try {
        fs = originalDir.getFileSystem(conf);
    } catch (IOException e) {
        throw new HiveException(e);
    }
    URI archiveUri = (new Path(originalDir, archiveName)).toUri();
    URI originalUri = ArchiveUtils.addSlash(originalDir.toUri());
    ArchiveUtils.HarPathHelper harHelper = new ArchiveUtils.HarPathHelper(conf, archiveUri, originalUri);
    // Verify that none of the matched partitions is already archived: an archive at a
    // different level conflicts with this request; at the same level the partitions
    // are simply already archived. Either way, we throw an error.
    for (Partition p : partitions) {
        if (ArchiveUtils.isArchived(p)) {
            if (ArchiveUtils.getArchivingLevel(p) != partSpecInfo.values.size()) {
                String name = ArchiveUtils.getPartialName(p, ArchiveUtils.getArchivingLevel(p));
                String m = String.format("Conflict with existing archive %s", name);
                throw new HiveException(m);
            } else {
                throw new HiveException("Partition(s) already archived");
            }
        }
    }
    boolean recovery = false;
    if (pathExists(intermediateArchivedDir) || pathExists(intermediateOriginalDir)) {
        recovery = true;
        console.printInfo("Starting recovery after failed ARCHIVE");
    }
    // Create an archived version of the partition in the intermediate directory,
    // unless it already exists. If it does exist, we assume it is good
    // to use, as the move operation that created it is atomic.
    if (!pathExists(intermediateArchivedDir) && !pathExists(intermediateOriginalDir)) {
        // First create the archive in a tmp dir so that if the job fails, the
        // bad files don't pollute the filesystem
        Path tmpPath = new Path(driverContext.getCtx().getExternalTmpPath(originalDir), "partlevel");
        console.printInfo("Creating " + archiveName + " for " + originalDir.toString());
        console.printInfo("in " + tmpPath);
        console.printInfo("Please wait... (this may take a while)");
        // Create the Hadoop archive
        int ret = 0;
        try {
            int maxJobNameLen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);
            String jobname = String.format("Archiving %s@%s", tbl.getTableName(), partSpecInfo.getName());
            jobname = Utilities.abbreviate(jobname, maxJobNameLen - 6);
            conf.set(MRJobConfig.JOB_NAME, jobname);
            HadoopArchives har = new HadoopArchives(conf);
            List<String> args = new ArrayList<String>();
            args.add("-archiveName");
            args.add(archiveName);
            args.add("-p");
            args.add(originalDir.toString());
            args.add(tmpPath.toString());
            ret = ToolRunner.run(har, args.toArray(new String[0]));
        } catch (Exception e) {
            throw new HiveException(e);
        }
        if (ret != 0) {
            throw new HiveException("Error while creating HAR");
        }
        // Move the archive from the tmp directory to an intermediate directory at the
        // same level as the partition directory, e.g. .../hr=12-intermediate-archived
        try {
            console.printInfo("Moving " + tmpPath + " to " + intermediateArchivedDir);
            if (pathExists(intermediateArchivedDir)) {
                throw new HiveException("The intermediate archive directory already exists.");
            }
            fs.rename(tmpPath, intermediateArchivedDir);
        } catch (IOException e) {
            throw new HiveException("Error while moving tmp directory");
        }
    } else {
        if (pathExists(intermediateArchivedDir)) {
            console.printInfo("Intermediate archive directory " + intermediateArchivedDir + " already exists. Assuming it contains an archived version of the partition");
        }
    }
    // Move the original partition directory to the intermediate original directory,
    // if the move hasn't been made already
    if (!pathExists(intermediateOriginalDir)) {
        console.printInfo("Moving " + originalDir + " to " + intermediateOriginalDir);
        moveDir(fs, originalDir, intermediateOriginalDir);
    } else {
        console.printInfo(intermediateOriginalDir + " already exists. " + "Assuming it contains the original files in the partition");
    }
    // Move the intermediate archived directory to the original parent directory
    if (!pathExists(originalDir)) {
        console.printInfo("Moving " + intermediateArchivedDir + " to " + originalDir);
        moveDir(fs, intermediateArchivedDir, originalDir);
    } else {
        console.printInfo(originalDir + " already exists. " + "Assuming it contains the archived version of the partition");
    }
    // Record this change in the metastore
    try {
        for (Partition p : partitions) {
            URI originalPartitionUri = ArchiveUtils.addSlash(p.getDataLocation().toUri());
            URI harPartitionDir = harHelper.getHarUri(originalPartitionUri);
            StringBuilder authority = new StringBuilder();
            if (harPartitionDir.getUserInfo() != null) {
                authority.append(harPartitionDir.getUserInfo()).append("@");
            }
            authority.append(harPartitionDir.getHost());
            if (harPartitionDir.getPort() != -1) {
                authority.append(":").append(harPartitionDir.getPort());
            }
            // construct as a Path to ensure there is no trailing slash at the end
            Path harPath = new Path(harPartitionDir.getScheme(), authority.toString(), harPartitionDir.getPath());
            setArchived(p, harPath, partSpecInfo.values.size());
            db.alterPartition(simpleDesc.getTableName(), p, null);
        }
    } catch (Exception e) {
        throw new HiveException("Unable to change the partition info for HAR", e);
    }
    // If a failure occurs here, the directory containing the original files
    // will not be deleted. The user will run ARCHIVE again to clear this up
    if (pathExists(intermediateOriginalDir)) {
        deleteDir(intermediateOriginalDir);
    }
    if (recovery) {
        console.printInfo("Recovery after ARCHIVE succeeded");
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ArrayList(java.util.ArrayList) IOException(java.io.IOException) URI(java.net.URI) AlreadyExistsException(org.apache.hadoop.hive.metastore.api.AlreadyExistsException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) IOException(java.io.IOException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) URISyntaxException(java.net.URISyntaxException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) StringUtils.stringifyException(org.apache.hadoop.util.StringUtils.stringifyException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SQLException(java.sql.SQLException) FileNotFoundException(java.io.FileNotFoundException) HiveAuthzPluginException(org.apache.hadoop.hive.ql.security.authorization.plugin.HiveAuthzPluginException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) PartSpecInfo(org.apache.hadoop.hive.ql.exec.ArchiveUtils.PartSpecInfo) FileSystem(org.apache.hadoop.fs.FileSystem) HadoopArchives(org.apache.hadoop.tools.HadoopArchives)
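
A minimal standalone sketch of the same invocation pattern, assuming hypothetical paths for the source directory and the staging directory: HadoopArchives implements the Hadoop Tool interface, so it is driven through ToolRunner with the -archiveName/-p argument list exactly as DDLTask.archive does above.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.tools.HadoopArchives;
import org.apache.hadoop.util.ToolRunner;

public class HarCreateSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        HadoopArchives har = new HadoopArchives(conf);
        List<String> harArgs = new ArrayList<String>();
        harArgs.add("-archiveName");
        // name of the archive to create inside the destination directory
        harArgs.add("data.har");
        harArgs.add("-p");
        // hypothetical parent directory whose contents are archived
        harArgs.add("/warehouse/tbl/ds=2024-01-01");
        // hypothetical destination directory that will contain data.har
        harArgs.add("/tmp/har-staging");
        // ToolRunner launches the archiving MapReduce job; a non-zero return
        // code signals failure, which the Hive code above turns into a HiveException
        int rc = ToolRunner.run(har, harArgs.toArray(new String[0]));
        if (rc != 0) {
            throw new RuntimeException("HadoopArchives returned error code " + rc);
        }
    }
}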

Example 2 with HadoopArchives

Use of org.apache.hadoop.tools.HadoopArchives in project hive by apache.

The makeHar method of the HarOutputCommitterPostProcessor class (HCatalog). It archives the contents of a finished output directory into a HAR file and then deletes the original directory.

/**
   * Creates a har file from the contents of a given directory, using that directory as the archive root.
   * @param context The job context
   * @param dir Directory to archive
   * @param harFile The HAR file to create
   */
public static void makeHar(JobContext context, String dir, String harFile) throws IOException {
    //    Configuration conf = context.getConfiguration();
    //    Credentials creds = context.getCredentials();
    //    HCatUtil.logAllTokens(LOG,context);
    int lastSep = harFile.lastIndexOf(Path.SEPARATOR_CHAR);
    Path archivePath = new Path(harFile.substring(0, lastSep));
    final String[] args = { "-archiveName", harFile.substring(lastSep + 1, harFile.length()), "-p", dir, "*", archivePath.toString() };
    try {
        Configuration newConf = new Configuration();
        FileSystem fs = archivePath.getFileSystem(newConf);
        String hadoopTokenFileLocationEnvSetting = System.getenv(HCatConstants.SYSENV_HADOOP_TOKEN_FILE_LOCATION);
        if ((hadoopTokenFileLocationEnvSetting != null) && (!hadoopTokenFileLocationEnvSetting.isEmpty())) {
            newConf.set(HCatConstants.CONF_MAPREDUCE_JOB_CREDENTIALS_BINARY, hadoopTokenFileLocationEnvSetting);
        //      LOG.info("System.getenv(\"HADOOP_TOKEN_FILE_LOCATION\") =["+  System.getenv("HADOOP_TOKEN_FILE_LOCATION")+"]");
        }
        //      for (FileStatus ds : fs.globStatus(new Path(dir, "*"))){
        //        LOG.info("src : "+ds.getPath().toUri().toString());
        //      }
        final HadoopArchives har = new HadoopArchives(newConf);
        int rc = ToolRunner.run(har, args);
        if (rc != 0) {
            throw new Exception("Har returned error code " + rc);
        }
        //      for (FileStatus hs : fs.globStatus(new Path(harFile, "*"))){
        //        LOG.info("dest : "+hs.getPath().toUri().toString());
        //      }
        //      doHarCheck(fs,harFile);
        //      LOG.info("Nuking " + dir);
        fs.delete(new Path(dir), true);
    } catch (Exception e) {
        throw new HCatException("Error creating Har [" + harFile + "] from [" + dir + "]", e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) HCatException(org.apache.hive.hcatalog.common.HCatException) HadoopArchives(org.apache.hadoop.tools.HadoopArchives) HCatException(org.apache.hive.hcatalog.common.HCatException) IOException(java.io.IOException)
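
Once created, a HAR can be read in place through the har:// filesystem, which is what ArchiveUtils.HarPathHelper builds the partition URIs for in Example 1. A minimal sketch, assuming a hypothetical archive location; the har:/// form resolves the archive against the default filesystem from the Hadoop configuration.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HarReadSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // hypothetical archive path; files inside the archive are addressed
        // as ordinary paths underneath the .har directory
        Path harRoot = new Path("har:///user/hive/warehouse/tbl/ds=2024-01-01/data.har");
        FileSystem harFs = harRoot.getFileSystem(conf);
        for (FileStatus status : harFs.listStatus(harRoot)) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
    }
}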

Aggregations

IOException (java.io.IOException) 2
FileSystem (org.apache.hadoop.fs.FileSystem) 2
Path (org.apache.hadoop.fs.Path) 2
HadoopArchives (org.apache.hadoop.tools.HadoopArchives) 2
FileNotFoundException (java.io.FileNotFoundException) 1
URI (java.net.URI) 1
URISyntaxException (java.net.URISyntaxException) 1
SQLException (java.sql.SQLException) 1
ArrayList (java.util.ArrayList) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
AlreadyExistsException (org.apache.hadoop.hive.metastore.api.AlreadyExistsException) 1
InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException) 1
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 1
NoSuchObjectException (org.apache.hadoop.hive.metastore.api.NoSuchObjectException) 1
PartSpecInfo (org.apache.hadoop.hive.ql.exec.ArchiveUtils.PartSpecInfo) 1
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 1
InvalidTableException (org.apache.hadoop.hive.ql.metadata.InvalidTableException) 1
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 1
Table (org.apache.hadoop.hive.ql.metadata.Table) 1
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 1