
Example 51 with SparkConf

use of org.apache.spark.SparkConf in project hive by apache.

the class ScriptOperator method process.

@Override
public void process(Object row, int tag) throws HiveException {
    // initialize the user's process only when you receive the first row
    if (firstRow) {
        firstRow = false;
        SparkConf sparkConf = null;
        try {
            String[] cmdArgs = splitArgs(conf.getScriptCmd());
            String prog = cmdArgs[0];
            File currentDir = new File(".").getAbsoluteFile();
            if (!new File(prog).isAbsolute()) {
                PathFinder finder = new PathFinder("PATH");
                finder.prependPathComponent(currentDir.toString());
                // In spark local mode, we need to search added files in root directory.
                if (HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
                    sparkConf = SparkEnv.get().conf();
                    finder.prependPathComponent(SparkFiles.getRootDirectory());
                }
                File f = finder.getAbsolutePath(prog);
                if (f != null) {
                    cmdArgs[0] = f.getAbsolutePath();
                }
                f = null;
            }
            String[] wrappedCmdArgs = addWrapper(cmdArgs);
            if (isLogInfoEnabled) {
                LOG.info("Executing " + Arrays.asList(wrappedCmdArgs));
                LOG.info("tablename=" + tableName);
                LOG.info("partname=" + partitionName);
                LOG.info("alias=" + alias);
            }
            ProcessBuilder pb = new ProcessBuilder(wrappedCmdArgs);
            Map<String, String> env = pb.environment();
            addJobConfToEnvironment(hconf, env);
            env.put(safeEnvVarName(HiveConf.ConfVars.HIVEALIAS.varname), String.valueOf(alias));
            // Create an environment variable that uniquely identifies this script
            // operator
            String idEnvVarName = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVESCRIPTIDENVVAR);
            String idEnvVarVal = getOperatorId();
            env.put(safeEnvVarName(idEnvVarName), idEnvVarVal);
            // In Spark local mode, point the child process's working directory at the
            // SparkFiles root directory in order to make added dependencies accessible.
            if (sparkConf != null) {
                String master = sparkConf.get("spark.master");
                if (master.equals("local") || master.startsWith("local[")) {
                    pb.directory(new File(SparkFiles.getRootDirectory()));
                }
            }
            // Runtime.getRuntime().exec(wrappedCmdArgs);
            scriptPid = pb.start();
            DataOutputStream scriptOut = new DataOutputStream(new BufferedOutputStream(scriptPid.getOutputStream()));
            DataInputStream scriptIn = new DataInputStream(new BufferedInputStream(scriptPid.getInputStream()));
            DataInputStream scriptErr = new DataInputStream(new BufferedInputStream(scriptPid.getErrorStream()));
            scriptOutWriter = conf.getInRecordWriterClass().newInstance();
            scriptOutWriter.initialize(scriptOut, hconf);
            RecordReader scriptOutputReader = conf.getOutRecordReaderClass().newInstance();
            scriptOutputReader.initialize(scriptIn, hconf, conf.getScriptOutputInfo().getProperties());
            outThread = new StreamThread(scriptOutputReader, new OutputStreamProcessor(scriptOutputDeserializer.getObjectInspector()), "OutputProcessor");
            RecordReader scriptErrReader = conf.getErrRecordReaderClass().newInstance();
            scriptErrReader.initialize(scriptErr, hconf, conf.getScriptErrInfo().getProperties());
            errThread = new StreamThread(scriptErrReader, new ErrorStreamProcessor(HiveConf.getIntVar(hconf, HiveConf.ConfVars.SCRIPTERRORLIMIT)), "ErrorProcessor");
            if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESCRIPTAUTOPROGRESS)) {
                autoProgressor = new AutoProgressor(this.getClass().getName(), reporter, Utilities.getDefaultNotificationInterval(hconf), HiveConf.getTimeVar(hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT, TimeUnit.MILLISECONDS));
                autoProgressor.go();
            }
            outThread.start();
            errThread.start();
        } catch (Exception e) {
            throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
        }
    }
    if (scriptError != null) {
        throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
    }
    try {
        Writable res = scriptInputSerializer.serialize(row, inputObjInspectors[tag]);
        scriptOutWriter.write(res);
    } catch (SerDeException e) {
        LOG.error("Error in serializing the row: " + e.getMessage());
        scriptError = e;
        serialize_error_count.set(serialize_error_count.get() + 1);
        throw new HiveException(e);
    } catch (IOException e) {
        if (isBrokenPipeException(e) && allowPartialConsumption()) {
            // Give the script process a chance to finish before marking the operator as done
            try {
                scriptPid.waitFor();
            } catch (InterruptedException interruptedException) {
            }
            // Best effort to let outThread finish before the operator is marked as done
            try {
                if (outThread != null) {
                    outThread.join(0);
                }
            } catch (Exception e2) {
                LOG.warn("Exception in closing outThread: " + StringUtils.stringifyException(e2));
            }
            setDone(true);
            LOG.warn("Got broken pipe during write: ignoring exception and setting operator to done");
        } else {
            LOG.error("Error in writing to script: " + e.getMessage());
            if (isBrokenPipeException(e)) {
                displayBrokenPipeInfo();
            }
            scriptError = e;
            throw new HiveException(ErrorMsg.SCRIPT_IO_ERROR.getErrorCodedMsg(), e);
        }
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) DataOutputStream(java.io.DataOutputStream) Writable(org.apache.hadoop.io.Writable) LongWritable(org.apache.hadoop.io.LongWritable) BytesWritable(org.apache.hadoop.io.BytesWritable) IOException(java.io.IOException) DataInputStream(java.io.DataInputStream) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) BufferedInputStream(java.io.BufferedInputStream) SparkConf(org.apache.spark.SparkConf) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream)
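
For context, here is a minimal standalone sketch of the same lookup order used above when resolving a relative script name. It assumes the code runs on a Spark node so that SparkFiles.getRootDirectory() holds files shipped with SparkContext#addFile; the class and method names are hypothetical and not part of Hive.

import java.io.File;

import org.apache.spark.SparkFiles;

public class ScriptPathResolver {

    // Hypothetical helper mirroring the lookup in process() above: a relative
    // script name is tried as-is first, then under the SparkFiles root where
    // files shipped with SparkContext#addFile are materialized on each node.
    static File resolveScript(String prog) {
        File candidate = new File(prog);
        if (candidate.isAbsolute() || candidate.exists()) {
            return candidate.getAbsoluteFile();
        }
        File inSparkRoot = new File(SparkFiles.getRootDirectory(), prog);
        return inSparkRoot.exists() ? inSparkRoot : candidate.getAbsoluteFile();
    }
}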

Example 52 with SparkConf

use of org.apache.spark.SparkConf in project hive by apache.

the class SparkSessionImpl method getMemoryAndCores.

@Override
public ObjectPair<Long, Integer> getMemoryAndCores() throws Exception {
    SparkConf sparkConf = hiveSparkClient.getSparkConf();
    int numExecutors = hiveSparkClient.getExecutorCount();
    // at start-up, we may be unable to get number of executors
    if (numExecutors <= 0) {
        return new ObjectPair<Long, Integer>(-1L, -1);
    }
    int executorMemoryInMB = Utils.memoryStringToMb(sparkConf.get("spark.executor.memory", "512m"));
    double memoryFraction = 1.0 - sparkConf.getDouble("spark.storage.memoryFraction", 0.6);
    long totalMemory = (long) (numExecutors * executorMemoryInMB * memoryFraction * 1024 * 1024);
    int totalCores;
    String masterURL = sparkConf.get("spark.master");
    if (masterURL.startsWith("spark")) {
        totalCores = sparkConf.contains("spark.default.parallelism") ? sparkConf.getInt("spark.default.parallelism", 1) : hiveSparkClient.getDefaultParallelism();
        totalCores = Math.max(totalCores, numExecutors);
    } else {
        int coresPerExecutor = sparkConf.getInt("spark.executor.cores", 1);
        totalCores = numExecutors * coresPerExecutor;
    }
    totalCores = totalCores / sparkConf.getInt("spark.task.cpus", 1);
    long memoryPerTaskInBytes = totalMemory / totalCores;
    LOG.info("Spark cluster current has executors: " + numExecutors + ", total cores: " + totalCores + ", memory per executor: " + executorMemoryInMB + "M, memoryFraction: " + memoryFraction);
    return new ObjectPair<Long, Integer>(Long.valueOf(memoryPerTaskInBytes), Integer.valueOf(totalCores));
}
Also used : SparkConf(org.apache.spark.SparkConf) ObjectPair(org.apache.hadoop.hive.common.ObjectPair)
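
The arithmetic in getMemoryAndCores can be factored into a small helper for testing. The following is a sketch, not Hive code: it assumes Spark 1.5+ so that SparkConf#getSizeAsMb is available, and it covers only the non-standalone branch where total cores come from spark.executor.cores.

import org.apache.spark.SparkConf;

public class MemoryPerTaskSketch {

    // Hypothetical helper: usable memory per task =
    //   executors * executor memory * (1 - storage fraction) / total task slots.
    static long memoryPerTaskInBytes(SparkConf sparkConf, int numExecutors) {
        long executorMemoryInMB = sparkConf.getSizeAsMb("spark.executor.memory", "512m");
        double memoryFraction = 1.0 - sparkConf.getDouble("spark.storage.memoryFraction", 0.6);
        long totalMemory = (long) (numExecutors * executorMemoryInMB * memoryFraction * 1024 * 1024);
        int coresPerExecutor = sparkConf.getInt("spark.executor.cores", 1);
        int totalCores = Math.max(1,
                (numExecutors * coresPerExecutor) / sparkConf.getInt("spark.task.cpus", 1));
        return totalMemory / totalCores;
    }
}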

Example 53 with SparkConf

use of org.apache.spark.SparkConf in project learning-spark by databricks.

the class BasicQueryCassandra method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicLoadJson [sparkMaster] [cassandraHost]");
    }
    String sparkMaster = args[0];
    String cassandraHost = args[1];
    SparkConf conf = new SparkConf(true).set("spark.cassandra.connection.host", cassandraHost);
    JavaSparkContext sc = new JavaSparkContext(sparkMaster, "basicquerycassandra", conf);
    // entire table as an RDD
    // assumes your table test was created as CREATE TABLE test.kv(key text PRIMARY KEY, value int);
    JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
    // print some basic stats
    System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {

        public double call(CassandraRow row) {
            return row.getInt("value");
        }
    }).stats());
    // write some basic data to Cassandra
    ArrayList<KeyValue> input = new ArrayList<KeyValue>();
    input.add(KeyValue.newInstance("mostmagic", 3));
    JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
    javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
}
Also used : ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) CassandraRow(com.datastax.spark.connector.CassandraRow) SparkConf(org.apache.spark.SparkConf)
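
The same connector setup can be expressed entirely through SparkConf instead of the two-argument JavaSparkContext constructor. A sketch follows; the local[2] master, app name, and 127.0.0.1 host are hypothetical placeholders.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class CassandraConfSketch {

    public static void main(String[] args) {
        // spark.cassandra.connection.host is the property the Cassandra connector
        // reads, exactly as in the example above; the other values are placeholders.
        SparkConf conf = new SparkConf(true)
                .setMaster("local[2]")
                .setAppName("basicquerycassandra")
                .set("spark.cassandra.connection.host", "127.0.0.1");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // javaFunctions(sc).cassandraTable("test", "kv") would follow here, as above.
        sc.stop();
    }
}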

Example 54 with SparkConf

use of org.apache.spark.SparkConf in project learning-spark by databricks.

the class SparkSQLTwitter method main.

public static void main(String[] args) {
    String inputFile = args[0];
    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlCtx = new SQLContext(sc);
    DataFrame input = sqlCtx.jsonFile(inputFile);
    // Print the schema
    input.printSchema();
    // Register the input schema RDD
    input.registerTempTable("tweets");
    // Select tweets based on the retweetCount
    DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
    Row[] result = topTweets.collect();
    for (Row row : result) {
        System.out.println(row.get(0));
    }
    JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {

        public String call(Row row) {
            return row.getString(0);
        }
    });
    System.out.println(topTweetText.collect());
    // Create a person and turn it into a Schema RDD
    ArrayList<HappyPerson> peopleList = new ArrayList<HappyPerson>();
    peopleList.add(new HappyPerson("holden", "coffee"));
    JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
    DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
    happyPeopleSchemaRDD.registerTempTable("happy_people");
    sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {

        @Override
        public Integer call(String str) throws Exception {
            return str.length();
        }
    }, DataTypes.IntegerType);
    DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava(text) FROM tweets LIMIT 10");
    Row[] lengths = tweetLength.collect();
    for (Row row : lengths) {
        System.out.println(row.get(0));
    }
    sc.stop();
}
Also used : ArrayList(java.util.ArrayList) DataFrame(org.apache.spark.sql.DataFrame) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf) SQLContext(org.apache.spark.sql.SQLContext)
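
The UDF registration above depends only on the SQLContext, so it can be exercised in isolation. Below is a sketch assuming Spark 1.3-1.6 (the SQLContext era) with a hypothetical app name and local master; the example above instead leaves SparkConf empty and lets spark-submit supply both.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;

public class UdfRegistrationSketch {

    public static void main(String[] args) {
        // Hypothetical configuration values for a local run.
        SparkConf conf = new SparkConf().setAppName("udfSketch").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlCtx = new SQLContext(sc);
        // Register the same string-length UDF as the example above.
        sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {

            @Override
            public Integer call(String str) {
                return str.length();
            }
        }, DataTypes.IntegerType);
        // Once a table such as "tweets" has been registered, the UDF is callable
        // from SQL, e.g. SELECT stringLengthJava(text) FROM tweets LIMIT 10.
        sc.stop();
    }
}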

Example 55 with SparkConf

use of org.apache.spark.SparkConf in project learning-spark by databricks.

the class WordCount method main.

public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load our input data.
    JavaRDD<String> input = sc.textFile(inputFile);
    // Split up into words.
    JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String x) {
            return Arrays.asList(x.split(" "));
        }
    });
    // Transform into word and count.
    JavaPairRDD<String, Integer> counts = words.mapToPair(new PairFunction<String, String, Integer>() {

        public Tuple2<String, Integer> call(String x) {
            return new Tuple2(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    });
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile(outputFile);
}
Also used : Iterable(java.lang.Iterable) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
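
For comparison, a Java 8 lambda version of the same pipeline. This is a sketch assuming the Spark 1.x Java API (where flatMap takes a function returning an Iterable) and a hypothetical local[*] master in place of one supplied by spark-submit.

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class LambdaWordCount {

    public static void main(String[] args) {
        // Hypothetical local-mode configuration for running outside spark-submit.
        SparkConf conf = new SparkConf().setAppName("wordCount").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> input = sc.textFile(args[0]);
        // Same split / pair / reduce steps as above, written with lambdas.
        JavaPairRDD<String, Integer> counts = input
                .flatMap(line -> Arrays.asList(line.split(" ")))
                .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
                .reduceByKey((x, y) -> x + y);
        counts.saveAsTextFile(args[1]);
        sc.stop();
    }
}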

Aggregations

SparkConf (org.apache.spark.SparkConf): 83
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 46
Test (org.junit.Test): 21
ArrayList (java.util.ArrayList): 20
Configuration (org.apache.hadoop.conf.Configuration): 20
Tuple2 (scala.Tuple2): 15
Graph (uk.gov.gchq.gaffer.graph.Graph): 13
DataOutputStream (java.io.DataOutputStream): 11
File (java.io.File): 10
HashSet (java.util.HashSet): 10
ByteArrayOutputStream (org.apache.commons.io.output.ByteArrayOutputStream): 10
Edge (uk.gov.gchq.gaffer.data.element.Edge): 10
Element (uk.gov.gchq.gaffer.data.element.Element): 10
Entity (uk.gov.gchq.gaffer.data.element.Entity): 10
User (uk.gov.gchq.gaffer.user.User): 10
Ignore (org.junit.Ignore): 6
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 5
JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext): 5
Test (org.testng.annotations.Test): 5
AddElements (uk.gov.gchq.gaffer.operation.impl.add.AddElements): 5