Use of org.apache.spark.SparkConf in project hive by apache:
class ScriptOperator, method process().
@Override
public void process(Object row, int tag) throws HiveException {
  // initialize the user's process only when you receive the first row
  if (firstRow) {
    firstRow = false;
    SparkConf sparkConf = null;
    try {
      String[] cmdArgs = splitArgs(conf.getScriptCmd());
      String prog = cmdArgs[0];
      File currentDir = new File(".").getAbsoluteFile();
      if (!new File(prog).isAbsolute()) {
        PathFinder finder = new PathFinder("PATH");
        finder.prependPathComponent(currentDir.toString());
        // In spark local mode, we need to search added files in root directory.
        if (HiveConf.getVar(hconf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
          sparkConf = SparkEnv.get().conf();
          finder.prependPathComponent(SparkFiles.getRootDirectory());
        }
        File f = finder.getAbsolutePath(prog);
        if (f != null) {
          cmdArgs[0] = f.getAbsolutePath();
        }
        f = null;
      }
      String[] wrappedCmdArgs = addWrapper(cmdArgs);
      if (isLogInfoEnabled) {
        LOG.info("Executing " + Arrays.asList(wrappedCmdArgs));
        LOG.info("tablename=" + tableName);
        LOG.info("partname=" + partitionName);
        LOG.info("alias=" + alias);
      }
      ProcessBuilder pb = new ProcessBuilder(wrappedCmdArgs);
      Map<String, String> env = pb.environment();
      addJobConfToEnvironment(hconf, env);
      env.put(safeEnvVarName(HiveConf.ConfVars.HIVEALIAS.varname), String.valueOf(alias));
      // Create an environment variable that uniquely identifies this script
      // operator
      String idEnvVarName = HiveConf.getVar(hconf, HiveConf.ConfVars.HIVESCRIPTIDENVVAR);
      String idEnvVarVal = getOperatorId();
      env.put(safeEnvVarName(idEnvVarName), idEnvVarVal);
      // In Spark local mode, run the script from the Spark files root directory
      // in order to make the added dependencies accessible.
      if (sparkConf != null) {
        String master = sparkConf.get("spark.master");
        if (master.equals("local") || master.startsWith("local[")) {
          pb.directory(new File(SparkFiles.getRootDirectory()));
        }
      }
      // Runtime.getRuntime().exec(wrappedCmdArgs);
      scriptPid = pb.start();
      DataOutputStream scriptOut = new DataOutputStream(new BufferedOutputStream(scriptPid.getOutputStream()));
      DataInputStream scriptIn = new DataInputStream(new BufferedInputStream(scriptPid.getInputStream()));
      DataInputStream scriptErr = new DataInputStream(new BufferedInputStream(scriptPid.getErrorStream()));
      scriptOutWriter = conf.getInRecordWriterClass().newInstance();
      scriptOutWriter.initialize(scriptOut, hconf);
      RecordReader scriptOutputReader = conf.getOutRecordReaderClass().newInstance();
      scriptOutputReader.initialize(scriptIn, hconf, conf.getScriptOutputInfo().getProperties());
      outThread = new StreamThread(scriptOutputReader,
          new OutputStreamProcessor(scriptOutputDeserializer.getObjectInspector()), "OutputProcessor");
      RecordReader scriptErrReader = conf.getErrRecordReaderClass().newInstance();
      scriptErrReader.initialize(scriptErr, hconf, conf.getScriptErrInfo().getProperties());
      errThread = new StreamThread(scriptErrReader,
          new ErrorStreamProcessor(HiveConf.getIntVar(hconf, HiveConf.ConfVars.SCRIPTERRORLIMIT)), "ErrorProcessor");
      if (HiveConf.getBoolVar(hconf, HiveConf.ConfVars.HIVESCRIPTAUTOPROGRESS)) {
        autoProgressor = new AutoProgressor(this.getClass().getName(), reporter,
            Utilities.getDefaultNotificationInterval(hconf),
            HiveConf.getTimeVar(hconf, HiveConf.ConfVars.HIVES_AUTO_PROGRESS_TIMEOUT, TimeUnit.MILLISECONDS));
        autoProgressor.go();
      }
      outThread.start();
      errThread.start();
    } catch (Exception e) {
      throw new HiveException(ErrorMsg.SCRIPT_INIT_ERROR.getErrorCodedMsg(), e);
    }
  }
  if (scriptError != null) {
    throw new HiveException(ErrorMsg.SCRIPT_GENERIC_ERROR.getErrorCodedMsg(), scriptError);
  }
  try {
    Writable res = scriptInputSerializer.serialize(row, inputObjInspectors[tag]);
    scriptOutWriter.write(res);
  } catch (SerDeException e) {
    LOG.error("Error in serializing the row: " + e.getMessage());
    scriptError = e;
    serialize_error_count.set(serialize_error_count.get() + 1);
    throw new HiveException(e);
  } catch (IOException e) {
    if (isBrokenPipeException(e) && allowPartialConsumption()) {
      // Give the outThread a chance to finish before marking the operator as done
      try {
        scriptPid.waitFor();
      } catch (InterruptedException interruptedException) {
      }
      // Make a best-effort attempt to drain the remaining script output before
      // marking the operator as done.
      try {
        if (outThread != null) {
          outThread.join(0);
        }
      } catch (Exception e2) {
        LOG.warn("Exception in closing outThread: " + StringUtils.stringifyException(e2));
      }
      setDone(true);
      LOG.warn("Got broken pipe during write: ignoring exception and setting operator to done");
    } else {
      LOG.error("Error in writing to script: " + e.getMessage());
      if (isBrokenPipeException(e)) {
        displayBrokenPipeInfo();
      }
      scriptError = e;
      throw new HiveException(ErrorMsg.SCRIPT_IO_ERROR.getErrorCodedMsg(), e);
    }
  }
}
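
The core mechanism in process() is launching the user script with ProcessBuilder and wiring its stdin, stdout, and stderr to separate writer and reader paths so that neither side blocks the other. Below is a stripped-down, standalone sketch of that pattern, using cat as a stand-in for the user script and plain JDK readers and writers in place of Hive's RecordWriter, RecordReader, and StreamThread classes.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;

// Minimal sketch of the ProcessBuilder pattern used by ScriptOperator:
// start an external command, feed it rows on stdin, and read its stdout
// on a separate thread.
public class ScriptPipeSketch {
  public static void main(String[] args) throws Exception {
    ProcessBuilder pb = new ProcessBuilder("cat");   // stand-in for the user script
    Process proc = pb.start();

    // Reader thread: plays the role of ScriptOperator's OutputProcessor thread.
    Thread outThread = new Thread(() -> {
      try (BufferedReader out = new BufferedReader(new InputStreamReader(proc.getInputStream()))) {
        String line;
        while ((line = out.readLine()) != null) {
          System.out.println("script emitted: " + line);
        }
      } catch (Exception e) {
        e.printStackTrace();
      }
    });
    outThread.start();

    // Writer side: plays the role of scriptOutWriter.write(...) per input row.
    // Closing the writer closes the script's stdin, signalling end of input.
    try (PrintWriter in = new PrintWriter(new OutputStreamWriter(proc.getOutputStream()))) {
      in.println("row 1");
      in.println("row 2");
    }
    proc.waitFor();
    outThread.join();
  }
}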
Use of org.apache.spark.SparkConf in project hive by apache:
class SparkSessionImpl, method getMemoryAndCores().
@Override
public ObjectPair<Long, Integer> getMemoryAndCores() throws Exception {
  SparkConf sparkConf = hiveSparkClient.getSparkConf();
  int numExecutors = hiveSparkClient.getExecutorCount();
  // at start-up, we may be unable to get number of executors
  if (numExecutors <= 0) {
    return new ObjectPair<Long, Integer>(-1L, -1);
  }
  int executorMemoryInMB = Utils.memoryStringToMb(sparkConf.get("spark.executor.memory", "512m"));
  double memoryFraction = 1.0 - sparkConf.getDouble("spark.storage.memoryFraction", 0.6);
  long totalMemory = (long) (numExecutors * executorMemoryInMB * memoryFraction * 1024 * 1024);
  int totalCores;
  String masterURL = sparkConf.get("spark.master");
  if (masterURL.startsWith("spark")) {
    totalCores = sparkConf.contains("spark.default.parallelism")
        ? sparkConf.getInt("spark.default.parallelism", 1)
        : hiveSparkClient.getDefaultParallelism();
    totalCores = Math.max(totalCores, numExecutors);
  } else {
    int coresPerExecutor = sparkConf.getInt("spark.executor.cores", 1);
    totalCores = numExecutors * coresPerExecutor;
  }
  totalCores = totalCores / sparkConf.getInt("spark.task.cpus", 1);
  long memoryPerTaskInBytes = totalMemory / totalCores;
  LOG.info("Spark cluster currently has executors: " + numExecutors + ", total cores: " + totalCores
      + ", memory per executor: " + executorMemoryInMB + "M, memoryFraction: " + memoryFraction);
  return new ObjectPair<Long, Integer>(Long.valueOf(memoryPerTaskInBytes), Integer.valueOf(totalCores));
}
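
The arithmetic above divides the non-storage share of executor memory evenly across the available task slots. A standalone sketch with assumed example values (10 executors, 512 MB each, the default spark.storage.memoryFraction of 0.6, 4 cores per executor, spark.task.cpus of 1) makes the numbers concrete:

// Standalone sketch of the memory-per-task calculation above, using
// assumed example values rather than a live SparkConf.
public class MemoryPerTaskSketch {
  public static void main(String[] args) {
    int numExecutors = 10;              // assumed executor count
    int executorMemoryInMB = 512;       // assumed spark.executor.memory
    double memoryFraction = 1.0 - 0.6;  // 1 - spark.storage.memoryFraction (default 0.6)
    int coresPerExecutor = 4;           // assumed spark.executor.cores
    int taskCpus = 1;                   // assumed spark.task.cpus

    long totalMemory = (long) (numExecutors * executorMemoryInMB * memoryFraction * 1024 * 1024);
    int totalCores = (numExecutors * coresPerExecutor) / taskCpus;
    long memoryPerTaskInBytes = totalMemory / totalCores;

    // With these numbers: totalMemory is 2048 MB, totalCores is 40,
    // so roughly 51 MiB of non-storage memory per concurrently running task.
    System.out.println("memory per task (bytes): " + memoryPerTaskInBytes);
  }
}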
Use of org.apache.spark.SparkConf in project learning-spark by databricks:
class BasicQueryCassandra, method main().
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage BasicQueryCassandra [sparkMaster] [cassandraHost]");
  }
  String sparkMaster = args[0];
  String cassandraHost = args[1];
  SparkConf conf = new SparkConf(true).set("spark.cassandra.connection.host", cassandraHost);
  JavaSparkContext sc = new JavaSparkContext(sparkMaster, "basicquerycassandra", conf);
  // Read the entire table as an RDD.
  // Assumes the table was created as: CREATE TABLE test.kv(key text PRIMARY KEY, value int);
  JavaRDD<CassandraRow> data = javaFunctions(sc).cassandraTable("test", "kv");
  // Print some basic stats.
  System.out.println(data.mapToDouble(new DoubleFunction<CassandraRow>() {
    public double call(CassandraRow row) {
      return row.getInt("value");
    }
  }).stats());
  // Write some basic data to Cassandra.
  ArrayList<KeyValue> input = new ArrayList<KeyValue>();
  input.add(KeyValue.newInstance("mostmagic", 3));
  JavaRDD<KeyValue> kvRDD = sc.parallelize(input);
  javaFunctions(kvRDD, KeyValue.class).saveToCassandra("test", "kv");
}
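
saveToCassandra maps JavaBean properties of KeyValue to the columns of test.kv, but the KeyValue class itself is not shown in this snippet. A hypothetical sketch of what such a bean could look like, with the field names and the newInstance factory inferred from how it is used above:

import java.io.Serializable;

// Hypothetical sketch of the KeyValue bean used above; the real class in
// learning-spark may differ. The getters map to the "key" and "value" columns.
public class KeyValue implements Serializable {
  private String key;
  private Integer value;

  public static KeyValue newInstance(String key, Integer value) {
    KeyValue kv = new KeyValue();
    kv.key = key;
    kv.value = value;
    return kv;
  }

  public String getKey() { return key; }
  public void setKey(String key) { this.key = key; }
  public Integer getValue() { return value; }
  public void setValue(Integer value) { this.value = value; }
}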
Use of org.apache.spark.SparkConf in project learning-spark by databricks:
class SparkSQLTwitter, method main().
public static void main(String[] args) {
  String inputFile = args[0];
  SparkConf conf = new SparkConf();
  JavaSparkContext sc = new JavaSparkContext(conf);
  SQLContext sqlCtx = new SQLContext(sc);
  DataFrame input = sqlCtx.jsonFile(inputFile);
  // Print the schema
  input.printSchema();
  // Register the input schema RDD
  input.registerTempTable("tweets");
  // Select tweets based on the retweetCount
  DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
  Row[] result = topTweets.collect();
  for (Row row : result) {
    System.out.println(row.get(0));
  }
  JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {
    public String call(Row row) {
      return row.getString(0);
    }
  });
  System.out.println(topTweetText.collect());
  // Create a person and turn it into a Schema RDD
  ArrayList<HappyPerson> peopleList = new ArrayList<HappyPerson>();
  peopleList.add(new HappyPerson("holden", "coffee"));
  JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
  DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
  happyPeopleSchemaRDD.registerTempTable("happy_people");
  sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
    @Override
    public Integer call(String str) throws Exception {
      return str.length();
    }
  }, DataTypes.IntegerType);
  // Note: 'text' in single quotes is a SQL string literal, so this returns the length of
  // the literal "text" for every row; stringLengthJava(text) would measure the tweet column.
  DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava('text') FROM tweets LIMIT 10");
  Row[] lengths = tweetLength.collect();
  for (Row row : lengths) {
    System.out.println(row.get(0));
  }
  sc.stop();
}
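
Similarly, applySchema infers the happy_people schema from the HappyPerson bean, which is not shown here. A hypothetical sketch, with the field names assumed from the constructor call new HappyPerson("holden", "coffee"):

import java.io.Serializable;

// Hypothetical sketch of the HappyPerson bean passed to applySchema above;
// the field names are assumptions. applySchema only needs a serializable
// JavaBean with getters and setters to infer the columns.
public class HappyPerson implements Serializable {
  private String name;
  private String favouriteBeverage;

  public HappyPerson() {}

  public HappyPerson(String name, String favouriteBeverage) {
    this.name = name;
    this.favouriteBeverage = favouriteBeverage;
  }

  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
  public String getFavouriteBeverage() { return favouriteBeverage; }
  public void setFavouriteBeverage(String favouriteBeverage) { this.favouriteBeverage = favouriteBeverage; }
}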
Use of org.apache.spark.SparkConf in project learning-spark by databricks:
class WordCount, method main().
public static void main(String[] args) throws Exception {
  String inputFile = args[0];
  String outputFile = args[1];
  // Create a Java Spark Context.
  SparkConf conf = new SparkConf().setAppName("wordCount");
  JavaSparkContext sc = new JavaSparkContext(conf);
  // Load our input data.
  JavaRDD<String> input = sc.textFile(inputFile);
  // Split up into words.
  JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String x) {
      return Arrays.asList(x.split(" "));
    }
  });
  // Transform into pairs of word and count.
  JavaPairRDD<String, Integer> counts = words.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String x) {
      return new Tuple2<String, Integer>(x, 1);
    }
  }).reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer x, Integer y) {
      return x + y;
    }
  });
  // Save the word count back out to a text file, causing evaluation.
  counts.saveAsTextFile(outputFile);
}