Example 21 with BZip2CompressorInputStream

Use of org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream in project hive by apache.

From the class QTestUtil, the method setupMetaStoreTableColumnStatsFor30TBTPCDSWorkload:

public static void setupMetaStoreTableColumnStatsFor30TBTPCDSWorkload(HiveConf conf) {
    Connection conn = null;
    // Track Statements and PreparedStatements so they can be closed in the finally block
    ArrayList<Statement> statements = new ArrayList<Statement>();
    try {
        // connection properties
        Properties props = new Properties();
        props.put("user", conf.get("javax.jdo.option.ConnectionUserName"));
        props.put("password", conf.get("javax.jdo.option.ConnectionPassword"));
        conn = DriverManager.getConnection(conf.get("javax.jdo.option.ConnectionURL"), props);
        ResultSet rs = null;
        Statement s = conn.createStatement();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Connected to metastore database ");
        }
        String mdbPath = AbstractCliConfig.HIVE_ROOT + "/data/files/tpcds-perf/metastore_export/";
        // Setup the table column stats
        BufferedReader br = new BufferedReader(new FileReader(new File(AbstractCliConfig.HIVE_ROOT + "/metastore/scripts/upgrade/derby/022-HIVE-11107.derby.sql")));
        String command;
        s.execute("DROP TABLE APP.TABLE_PARAMS");
        s.execute("DROP TABLE APP.TAB_COL_STATS");
        // Create the column stats table
        while ((command = br.readLine()) != null) {
            if (!command.endsWith(";")) {
                continue;
            }
            if (LOG.isDebugEnabled()) {
                LOG.debug("Going to run command : " + command);
            }
            try {
                PreparedStatement psCommand = conn.prepareStatement(command.substring(0, command.length() - 1));
                statements.add(psCommand);
                psCommand.execute();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("successfully completed " + command);
                }
            } catch (SQLException e) {
                LOG.info("Got SQL Exception " + e.getMessage());
            }
        }
        br.close();
        java.nio.file.Path tabColStatsCsv = FileSystems.getDefault().getPath(mdbPath, "csv", "TAB_COL_STATS.txt.bz2");
        java.nio.file.Path tabParamsCsv = FileSystems.getDefault().getPath(mdbPath, "csv", "TABLE_PARAMS.txt.bz2");
        // Set up the foreign key constraints properly in the TAB_COL_STATS data
        String tmpBaseDir = System.getProperty(TEST_TMP_DIR_PROPERTY);
        java.nio.file.Path tmpFileLoc1 = FileSystems.getDefault().getPath(tmpBaseDir, "TAB_COL_STATS.txt");
        java.nio.file.Path tmpFileLoc2 = FileSystems.getDefault().getPath(tmpBaseDir, "TABLE_PARAMS.txt");
        // Orders keys by descending length, then lexicographically
        class MyComp implements Comparator<String> {

            @Override
            public int compare(String str1, String str2) {
                if (str2.length() != str1.length()) {
                    return str2.length() - str1.length();
                }
                return str1.compareTo(str2);
            }
        }
        final SortedMap<String, Integer> tableNameToID = new TreeMap<String, Integer>(new MyComp());
        rs = s.executeQuery("SELECT * FROM APP.TBLS");
        while (rs.next()) {
            String tblName = rs.getString("TBL_NAME");
            Integer tblId = rs.getInt("TBL_ID");
            tableNameToID.put(tblName, tblId);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Resultset : " + tblName + " | " + tblId);
            }
        }
        final Map<String, Map<String, String>> data = new HashMap<>();
        rs = s.executeQuery("select TBLS.TBL_NAME, a.COLUMN_NAME, a.TYPE_NAME from  " + "(select COLUMN_NAME, TYPE_NAME, SDS.SD_ID from APP.COLUMNS_V2 join APP.SDS on SDS.CD_ID = COLUMNS_V2.CD_ID) a" + " join APP.TBLS on  TBLS.SD_ID = a.SD_ID");
        while (rs.next()) {
            String tblName = rs.getString(1);
            String colName = rs.getString(2);
            String typeName = rs.getString(3);
            data.computeIfAbsent(tblName, k -> new HashMap<>()).put(colName, typeName);
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(new BZip2CompressorInputStream(Files.newInputStream(tabColStatsCsv, StandardOpenOption.READ))));
        Stream<String> replaced = reader.lines().parallel().map(str -> {
            String[] splits = str.split(",");
            String tblName = splits[0];
            String colName = splits[1];
            Integer tblID = tableNameToID.get(tblName);
            StringBuilder sb = new StringBuilder("default@" + tblName + "@" + colName + "@" + data.get(tblName).get(colName) + "@");
            for (int i = 2; i < splits.length; i++) {
                sb.append(splits[i] + "@");
            }
            // Add tbl_id and empty bitvector
            return sb.append(tblID).append("@").toString();
        });
        Files.write(tmpFileLoc1, (Iterable<String>) replaced::iterator);
        replaced.close();
        reader.close();
        BufferedReader reader2 = new BufferedReader(new InputStreamReader(new BZip2CompressorInputStream(Files.newInputStream(tabParamsCsv, StandardOpenOption.READ))));
        final Map<String, String> colStats = new ConcurrentHashMap<>();
        Stream<String> replacedStream = reader2.lines().parallel().map(str -> {
            String[] splits = str.split("_@");
            String tblName = splits[0];
            Integer tblId = tableNameToID.get(tblName);
            Map<String, String> cols = data.get(tblName);
            StringBuilder sb = new StringBuilder();
            sb.append("{\"COLUMN_STATS\":{");
            for (String colName : cols.keySet()) {
                sb.append("\"" + colName + "\":\"true\",");
            }
            // Drop the trailing comma so the generated JSON stays valid
            if (!cols.isEmpty()) {
                sb.setLength(sb.length() - 1);
            }
            sb.append("},\"BASIC_STATS\":\"true\"}");
            colStats.put(tblId.toString(), sb.toString());
            return tblId.toString() + "@" + splits[1];
        });
        Files.write(tmpFileLoc2, (Iterable<String>) replacedStream::iterator);
        Files.write(tmpFileLoc2, (Iterable<String>) colStats.entrySet().stream().map(map -> map.getKey() + "@COLUMN_STATS_ACCURATE@" + map.getValue())::iterator, StandardOpenOption.APPEND);
        replacedStream.close();
        reader2.close();
        // Load the column stats and table params with 30 TB scale
        String importStatement1 = "CALL SYSCS_UTIL.SYSCS_IMPORT_TABLE(null, '" + "TAB_COL_STATS" + "', '" + tmpFileLoc1.toAbsolutePath().toString() + "', '@', null, 'UTF-8', 1)";
        String importStatement2 = "CALL SYSCS_UTIL.SYSCS_IMPORT_TABLE(null, '" + "TABLE_PARAMS" + "', '" + tmpFileLoc2.toAbsolutePath().toString() + "', '@', null, 'UTF-8', 1)";
        try {
            PreparedStatement psImport1 = conn.prepareStatement(importStatement1);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Going to execute : " + importStatement1);
            }
            statements.add(psImport1);
            psImport1.execute();
            if (LOG.isDebugEnabled()) {
                LOG.debug("successfully completed " + importStatement1);
            }
            PreparedStatement psImport2 = conn.prepareStatement(importStatement2);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Going to execute : " + importStatement2);
            }
            statements.add(psImport2);
            psImport2.execute();
            if (LOG.isDebugEnabled()) {
                LOG.debug("successfully completed " + importStatement2);
            }
        } catch (SQLException e) {
            LOG.info("Got SQL Exception  " + e.getMessage());
        }
    } catch (FileNotFoundException e1) {
        LOG.info("Got File not found Exception " + e1.getMessage());
    } catch (IOException e1) {
        LOG.info("Got IOException " + e1.getMessage());
    } catch (SQLException e1) {
        LOG.info("Got SQLException " + e1.getMessage());
    } finally {
        // Close all Statements and PreparedStatements (PreparedStatement extends Statement)
        for (Statement st : statements) {
            try {
                if (st != null) {
                    st.close();
                }
            } catch (SQLException sqle) {
                // ignore failures while closing
            }
        }
        statements.clear();
        // Connection
        try {
            if (conn != null) {
                conn.close();
                conn = null;
            }
        } catch (SQLException sqle) {
            // ignore failures while closing
        }
    }
}
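
The core idiom above is wrapping the raw file stream in a BZip2CompressorInputStream so the .bz2 CSV files can be read line by line without first decompressing them to disk. A minimal, self-contained sketch of that idiom (the file name is hypothetical; any bzip2-compressed text file works):

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;

public class Bzip2LineReader {

    public static void main(String[] args) throws Exception {
        // Hypothetical input file; any bzip2-compressed text file works here
        Path input = Paths.get("TAB_COL_STATS.txt.bz2");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new BZip2CompressorInputStream(Files.newInputStream(input)),
                StandardCharsets.UTF_8))) {
            // Decompresses on the fly; print the first few lines as a smoke test
            reader.lines().limit(5).forEach(System.out::println);
        }
    }
}

Unlike the Hive method above, this sketch uses try-with-resources so the stream is closed even on failure. Note that the single-argument constructor stops after the first bzip2 stream in the file; use the two-argument constructor with decompressConcatenated set to true to read concatenated streams.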
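The bulk load at the end relies on Derby's SYSCS_UTIL.SYSCS_IMPORT_TABLE system procedure. A minimal sketch of that call over JDBC, with a hypothetical connection URL and file path; passing null for the schema uses the current schema, '@' is the column delimiter (matching the files written above), and the trailing 1 replaces existing rows rather than appending:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class DerbyBulkImport {

    public static void main(String[] args) throws Exception {
        // Hypothetical embedded-Derby URL and import file path
        try (Connection conn = DriverManager.getConnection("jdbc:derby:metastore_db");
             PreparedStatement ps = conn.prepareStatement(
                     "CALL SYSCS_UTIL.SYSCS_IMPORT_TABLE(null, 'TAB_COL_STATS', "
                             + "'/tmp/TAB_COL_STATS.txt', '@', null, 'UTF-8', 1)")) {
            // Runs the import server-side; Derby reads the file directly
            ps.execute();
        }
    }
}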
